diff mbox series

selftests/mm: Introduce a test program to assess swap entry allocation for thp_swapout

Message ID 20240620002648.75204-1-21cnbao@gmail.com (mailing list archive)
State New
Headers show
Series selftests/mm: Introduce a test program to assess swap entry allocation for thp_swapout | expand

Commit Message

Barry Song June 20, 2024, 12:26 a.m. UTC
From: Barry Song <v-songbaohua@oppo.com>

Both Ryan and Chris have been utilizing the small test program to aid
in debugging and identifying issues with swap entry allocation. While
a real or intricate workload might be more suitable for assessing the
correctness and effectiveness of the swap allocation policy, a small
test program presents a simpler means of understanding the problem and
initially verifying the improvements being made.

Let's endeavor to integrate it into the self-test suite. Although it
presently only accommodates 64KB and 4KB, I'm optimistic that we can
expand its capabilities to support multiple sizes and simulate more
complex systems in the future as required.

Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 tools/testing/selftests/mm/Makefile           |   1 +
 .../selftests/mm/thp_swap_allocator_test.c    | 192 ++++++++++++++++++
 2 files changed, 193 insertions(+)
 create mode 100644 tools/testing/selftests/mm/thp_swap_allocator_test.c

Comments

Huang, Ying June 20, 2024, 1:53 a.m. UTC | #1
Barry Song <21cnbao@gmail.com> writes:

> From: Barry Song <v-songbaohua@oppo.com>
>
> Both Ryan and Chris have been utilizing the small test program to aid
> in debugging and identifying issues with swap entry allocation. While
> a real or intricate workload might be more suitable for assessing the
> correctness and effectiveness of the swap allocation policy, a small
> test program presents a simpler means of understanding the problem and
> initially verifying the improvements being made.
>
> Let's endeavor to integrate it into the self-test suite. Although it
> presently only accommodates 64KB and 4KB, I'm optimistic that we can
> expand its capabilities to support multiple sizes and simulate more
> complex systems in the future as required.

IIUC, this is a performance test program instead of functionality test
program.  Does it match the purpose of the kernel selftest?

> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
>  tools/testing/selftests/mm/Makefile           |   1 +
>  .../selftests/mm/thp_swap_allocator_test.c    | 192 ++++++++++++++++++
>  2 files changed, 193 insertions(+)
>  create mode 100644 tools/testing/selftests/mm/thp_swap_allocator_test.c
>
> diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
> index e1aa09ddaa3d..64164ad66835 100644
> --- a/tools/testing/selftests/mm/Makefile
> +++ b/tools/testing/selftests/mm/Makefile
> @@ -65,6 +65,7 @@ TEST_GEN_FILES += mseal_test
>  TEST_GEN_FILES += seal_elf
>  TEST_GEN_FILES += on-fault-limit
>  TEST_GEN_FILES += pagemap_ioctl
> +TEST_GEN_FILES += thp_swap_allocator_test
>  TEST_GEN_FILES += thuge-gen
>  TEST_GEN_FILES += transhuge-stress
>  TEST_GEN_FILES += uffd-stress
> diff --git a/tools/testing/selftests/mm/thp_swap_allocator_test.c b/tools/testing/selftests/mm/thp_swap_allocator_test.c
> new file mode 100644
> index 000000000000..4443a906d0f8
> --- /dev/null
> +++ b/tools/testing/selftests/mm/thp_swap_allocator_test.c
> @@ -0,0 +1,192 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * thp_swap_allocator_test
> + *
> + * The purpose of this test program is helping check if THP swpout
> + * can correctly get swap slots to swap out as a whole instead of
> + * being split. It randomly releases swap entries through madvise
> + * DONTNEED and do swapout on two memory areas: a memory area for
> + * 64KB THP and the other area for small folios. The second memory
> + * can be enabled by "-s".
> + * Before running the program, we need to setup a zRAM or similar
> + * swap device by:
> + *  echo lzo > /sys/block/zram0/comp_algorithm
> + *  echo 64M > /sys/block/zram0/disksize
> + *  echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
> + *  echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
> + *  mkswap /dev/zram0
> + *  swapon /dev/zram0
> + * The expected result should be 0% anon swpout fallback ratio w/ or
> + * w/o "-s".
> + *
> + * Author(s): Barry Song <v-songbaohua@oppo.com>
> + */
> +
> +#define _GNU_SOURCE
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <string.h>
> +#include <sys/mman.h>
> +#include <errno.h>
> +#include <time.h>
> +
> +#define MEMSIZE_MTHP (60 * 1024 * 1024)
> +#define MEMSIZE_SMALLFOLIO (1 * 1024 * 1024)
> +#define ALIGNMENT_MTHP (64 * 1024)
> +#define ALIGNMENT_SMALLFOLIO (4 * 1024)
> +#define TOTAL_DONTNEED_MTHP (16 * 1024 * 1024)
> +#define TOTAL_DONTNEED_SMALLFOLIO (768 * 1024)
> +#define MTHP_FOLIO_SIZE (64 * 1024)
> +
> +#define SWPOUT_PATH \
> +	"/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout"
> +#define SWPOUT_FALLBACK_PATH \
> +	"/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback"
> +
> +static void *aligned_alloc_mem(size_t size, size_t alignment)
> +{
> +	void *mem = NULL;
> +
> +	if (posix_memalign(&mem, alignment, size) != 0) {
> +		perror("posix_memalign");
> +		return NULL;
> +	}
> +	return mem;
> +}
> +
> +static void random_madvise_dontneed(void *mem, size_t mem_size,
> +		size_t align_size, size_t total_dontneed_size)
> +{
> +	size_t num_pages = total_dontneed_size / align_size;
> +	size_t i;
> +	size_t offset;
> +	void *addr;
> +
> +	for (i = 0; i < num_pages; ++i) {
> +		offset = (rand() % (mem_size / align_size)) * align_size;
> +		addr = (char *)mem + offset;
> +		if (madvise(addr, align_size, MADV_DONTNEED) != 0)
> +			perror("madvise dontneed");

IIUC, this simulates align_size (generally 64KB) swap-in.  That is, it
simulate the effect of large size swap-in when it's not available in
kernel.  If we have large size swap-in in kernel in the future, this
becomes unnecessary.

Additionally, we have not reached the consensus that we should always
swap-in with swapped-out size.  So, I suspect that this test may not
reflect real situation in the future.  Although it doesn't reflect
current situation too.

> +
> +		memset(addr, 0x11, align_size);
> +	}
> +}
> +

[snip]

--
Best Regards,
Huang, Ying
Barry Song June 20, 2024, 2:04 a.m. UTC | #2
On Thu, Jun 20, 2024 at 1:55 PM Huang, Ying <ying.huang@intel.com> wrote:
>
> Barry Song <21cnbao@gmail.com> writes:
>
> > From: Barry Song <v-songbaohua@oppo.com>
> >
> > Both Ryan and Chris have been utilizing the small test program to aid
> > in debugging and identifying issues with swap entry allocation. While
> > a real or intricate workload might be more suitable for assessing the
> > correctness and effectiveness of the swap allocation policy, a small
> > test program presents a simpler means of understanding the problem and
> > initially verifying the improvements being made.
> >
> > Let's endeavor to integrate it into the self-test suite. Although it
> > presently only accommodates 64KB and 4KB, I'm optimistic that we can
> > expand its capabilities to support multiple sizes and simulate more
> > complex systems in the future as required.
>
> IIUC, this is a performance test program instead of functionality test
> program.  Does it match the purpose of the kernel selftest?

I have a differing perspective. I maintain that the functionality is
not functioning
as expected. Despite having all the necessary resources for allocation, failure
persists, indicating a lack of functionality.

>
> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > ---
> >  tools/testing/selftests/mm/Makefile           |   1 +
> >  .../selftests/mm/thp_swap_allocator_test.c    | 192 ++++++++++++++++++
> >  2 files changed, 193 insertions(+)
> >  create mode 100644 tools/testing/selftests/mm/thp_swap_allocator_test.c
> >
> > diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
> > index e1aa09ddaa3d..64164ad66835 100644
> > --- a/tools/testing/selftests/mm/Makefile
> > +++ b/tools/testing/selftests/mm/Makefile
> > @@ -65,6 +65,7 @@ TEST_GEN_FILES += mseal_test
> >  TEST_GEN_FILES += seal_elf
> >  TEST_GEN_FILES += on-fault-limit
> >  TEST_GEN_FILES += pagemap_ioctl
> > +TEST_GEN_FILES += thp_swap_allocator_test
> >  TEST_GEN_FILES += thuge-gen
> >  TEST_GEN_FILES += transhuge-stress
> >  TEST_GEN_FILES += uffd-stress
> > diff --git a/tools/testing/selftests/mm/thp_swap_allocator_test.c b/tools/testing/selftests/mm/thp_swap_allocator_test.c
> > new file mode 100644
> > index 000000000000..4443a906d0f8
> > --- /dev/null
> > +++ b/tools/testing/selftests/mm/thp_swap_allocator_test.c
> > @@ -0,0 +1,192 @@
> > +// SPDX-License-Identifier: GPL-2.0-or-later
> > +/*
> > + * thp_swap_allocator_test
> > + *
> > + * The purpose of this test program is helping check if THP swpout
> > + * can correctly get swap slots to swap out as a whole instead of
> > + * being split. It randomly releases swap entries through madvise
> > + * DONTNEED and do swapout on two memory areas: a memory area for
> > + * 64KB THP and the other area for small folios. The second memory
> > + * can be enabled by "-s".
> > + * Before running the program, we need to setup a zRAM or similar
> > + * swap device by:
> > + *  echo lzo > /sys/block/zram0/comp_algorithm
> > + *  echo 64M > /sys/block/zram0/disksize
> > + *  echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
> > + *  echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
> > + *  mkswap /dev/zram0
> > + *  swapon /dev/zram0
> > + * The expected result should be 0% anon swpout fallback ratio w/ or
> > + * w/o "-s".
> > + *
> > + * Author(s): Barry Song <v-songbaohua@oppo.com>
> > + */
> > +
> > +#define _GNU_SOURCE
> > +#include <stdio.h>
> > +#include <stdlib.h>
> > +#include <unistd.h>
> > +#include <string.h>
> > +#include <sys/mman.h>
> > +#include <errno.h>
> > +#include <time.h>
> > +
> > +#define MEMSIZE_MTHP (60 * 1024 * 1024)
> > +#define MEMSIZE_SMALLFOLIO (1 * 1024 * 1024)
> > +#define ALIGNMENT_MTHP (64 * 1024)
> > +#define ALIGNMENT_SMALLFOLIO (4 * 1024)
> > +#define TOTAL_DONTNEED_MTHP (16 * 1024 * 1024)
> > +#define TOTAL_DONTNEED_SMALLFOLIO (768 * 1024)
> > +#define MTHP_FOLIO_SIZE (64 * 1024)
> > +
> > +#define SWPOUT_PATH \
> > +     "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout"
> > +#define SWPOUT_FALLBACK_PATH \
> > +     "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback"
> > +
> > +static void *aligned_alloc_mem(size_t size, size_t alignment)
> > +{
> > +     void *mem = NULL;
> > +
> > +     if (posix_memalign(&mem, alignment, size) != 0) {
> > +             perror("posix_memalign");
> > +             return NULL;
> > +     }
> > +     return mem;
> > +}
> > +
> > +static void random_madvise_dontneed(void *mem, size_t mem_size,
> > +             size_t align_size, size_t total_dontneed_size)
> > +{
> > +     size_t num_pages = total_dontneed_size / align_size;
> > +     size_t i;
> > +     size_t offset;
> > +     void *addr;
> > +
> > +     for (i = 0; i < num_pages; ++i) {
> > +             offset = (rand() % (mem_size / align_size)) * align_size;
> > +             addr = (char *)mem + offset;
> > +             if (madvise(addr, align_size, MADV_DONTNEED) != 0)
> > +                     perror("madvise dontneed");
>
> IIUC, this simulates align_size (generally 64KB) swap-in.  That is, it
> simulate the effect of large size swap-in when it's not available in
> kernel.  If we have large size swap-in in kernel in the future, this
> becomes unnecessary.
>
> Additionally, we have not reached the consensus that we should always
> swap-in with swapped-out size.  So, I suspect that this test may not
> reflect real situation in the future.  Although it doesn't reflect
> current situation too.

Disagree again. releasing the whole mTHP swaps is the best case. Even in
the best-case scenario, if we fail, it raises concerns for handling potentially
more challenging situations.

I don't find it hard to incorporate additional features into this test
program to simulate more intricate scenarios.

>
> > +
> > +             memset(addr, 0x11, align_size);
> > +     }
> > +}
> > +
>
> [snip]
>
> --
> Best Regards,
> Huang, Ying

Thanks
Barry
Huang, Ying June 20, 2024, 5:20 a.m. UTC | #3
Barry Song <21cnbao@gmail.com> writes:

> On Thu, Jun 20, 2024 at 1:55 PM Huang, Ying <ying.huang@intel.com> wrote:
>>
>> Barry Song <21cnbao@gmail.com> writes:
>>
>> > From: Barry Song <v-songbaohua@oppo.com>
>> >
>> > Both Ryan and Chris have been utilizing the small test program to aid
>> > in debugging and identifying issues with swap entry allocation. While
>> > a real or intricate workload might be more suitable for assessing the
>> > correctness and effectiveness of the swap allocation policy, a small
>> > test program presents a simpler means of understanding the problem and
>> > initially verifying the improvements being made.
>> >
>> > Let's endeavor to integrate it into the self-test suite. Although it
>> > presently only accommodates 64KB and 4KB, I'm optimistic that we can
>> > expand its capabilities to support multiple sizes and simulate more
>> > complex systems in the future as required.
>>
>> IIUC, this is a performance test program instead of functionality test
>> program.  Does it match the purpose of the kernel selftest?
>
> I have a differing perspective. I maintain that the functionality is
> not functioning
> as expected. Despite having all the necessary resources for allocation, failure
> persists, indicating a lack of functionality.

Is there any user visual functionality issue?

>>
>> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>> > ---
>> >  tools/testing/selftests/mm/Makefile           |   1 +
>> >  .../selftests/mm/thp_swap_allocator_test.c    | 192 ++++++++++++++++++
>> >  2 files changed, 193 insertions(+)
>> >  create mode 100644 tools/testing/selftests/mm/thp_swap_allocator_test.c
>> >
>> > diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
>> > index e1aa09ddaa3d..64164ad66835 100644
>> > --- a/tools/testing/selftests/mm/Makefile
>> > +++ b/tools/testing/selftests/mm/Makefile
>> > @@ -65,6 +65,7 @@ TEST_GEN_FILES += mseal_test
>> >  TEST_GEN_FILES += seal_elf
>> >  TEST_GEN_FILES += on-fault-limit
>> >  TEST_GEN_FILES += pagemap_ioctl
>> > +TEST_GEN_FILES += thp_swap_allocator_test
>> >  TEST_GEN_FILES += thuge-gen
>> >  TEST_GEN_FILES += transhuge-stress
>> >  TEST_GEN_FILES += uffd-stress
>> > diff --git a/tools/testing/selftests/mm/thp_swap_allocator_test.c b/tools/testing/selftests/mm/thp_swap_allocator_test.c
>> > new file mode 100644
>> > index 000000000000..4443a906d0f8
>> > --- /dev/null
>> > +++ b/tools/testing/selftests/mm/thp_swap_allocator_test.c
>> > @@ -0,0 +1,192 @@
>> > +// SPDX-License-Identifier: GPL-2.0-or-later
>> > +/*
>> > + * thp_swap_allocator_test
>> > + *
>> > + * The purpose of this test program is helping check if THP swpout
>> > + * can correctly get swap slots to swap out as a whole instead of
>> > + * being split. It randomly releases swap entries through madvise
>> > + * DONTNEED and do swapout on two memory areas: a memory area for
>> > + * 64KB THP and the other area for small folios. The second memory
>> > + * can be enabled by "-s".
>> > + * Before running the program, we need to setup a zRAM or similar
>> > + * swap device by:
>> > + *  echo lzo > /sys/block/zram0/comp_algorithm
>> > + *  echo 64M > /sys/block/zram0/disksize
>> > + *  echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
>> > + *  echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
>> > + *  mkswap /dev/zram0
>> > + *  swapon /dev/zram0
>> > + * The expected result should be 0% anon swpout fallback ratio w/ or
>> > + * w/o "-s".
>> > + *
>> > + * Author(s): Barry Song <v-songbaohua@oppo.com>
>> > + */
>> > +
>> > +#define _GNU_SOURCE
>> > +#include <stdio.h>
>> > +#include <stdlib.h>
>> > +#include <unistd.h>
>> > +#include <string.h>
>> > +#include <sys/mman.h>
>> > +#include <errno.h>
>> > +#include <time.h>
>> > +
>> > +#define MEMSIZE_MTHP (60 * 1024 * 1024)
>> > +#define MEMSIZE_SMALLFOLIO (1 * 1024 * 1024)
>> > +#define ALIGNMENT_MTHP (64 * 1024)
>> > +#define ALIGNMENT_SMALLFOLIO (4 * 1024)
>> > +#define TOTAL_DONTNEED_MTHP (16 * 1024 * 1024)
>> > +#define TOTAL_DONTNEED_SMALLFOLIO (768 * 1024)
>> > +#define MTHP_FOLIO_SIZE (64 * 1024)
>> > +
>> > +#define SWPOUT_PATH \
>> > +     "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout"
>> > +#define SWPOUT_FALLBACK_PATH \
>> > +     "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback"
>> > +
>> > +static void *aligned_alloc_mem(size_t size, size_t alignment)
>> > +{
>> > +     void *mem = NULL;
>> > +
>> > +     if (posix_memalign(&mem, alignment, size) != 0) {
>> > +             perror("posix_memalign");
>> > +             return NULL;
>> > +     }
>> > +     return mem;
>> > +}
>> > +
>> > +static void random_madvise_dontneed(void *mem, size_t mem_size,
>> > +             size_t align_size, size_t total_dontneed_size)
>> > +{
>> > +     size_t num_pages = total_dontneed_size / align_size;
>> > +     size_t i;
>> > +     size_t offset;
>> > +     void *addr;
>> > +
>> > +     for (i = 0; i < num_pages; ++i) {
>> > +             offset = (rand() % (mem_size / align_size)) * align_size;
>> > +             addr = (char *)mem + offset;
>> > +             if (madvise(addr, align_size, MADV_DONTNEED) != 0)
>> > +                     perror("madvise dontneed");
>>
>> IIUC, this simulates align_size (generally 64KB) swap-in.  That is, it
>> simulate the effect of large size swap-in when it's not available in
>> kernel.  If we have large size swap-in in kernel in the future, this
>> becomes unnecessary.
>>
>> Additionally, we have not reached the consensus that we should always
>> swap-in with swapped-out size.  So, I suspect that this test may not
>> reflect real situation in the future.  Although it doesn't reflect
>> current situation too.
>
> Disagree again. releasing the whole mTHP swaps is the best case. Even in
> the best-case scenario, if we fail, it raises concerns for handling potentially
> more challenging situations.

Repeating sequential anonymous pages writing is the best case.

> I don't find it hard to incorporate additional features into this test
> program to simulate more intricate scenarios.

IMHO, we don't really need this special purpose test.  We can have some
more general basic tests, for example, sequential anonymous pages
writing/reading, random anonymous pages writing/reading, and combination
of them.

--
Best Regards,
Huang, Ying

>>
>> > +
>> > +             memset(addr, 0x11, align_size);
>> > +     }
>> > +}
>> > +
>>
>> [snip]
>>
>> --
>> Best Regards,
>> Huang, Ying
>
> Thanks
> Barry
Barry Song June 20, 2024, 6:09 a.m. UTC | #4
On Thu, Jun 20, 2024 at 5:22 PM Huang, Ying <ying.huang@intel.com> wrote:
>
> Barry Song <21cnbao@gmail.com> writes:
>
> > On Thu, Jun 20, 2024 at 1:55 PM Huang, Ying <ying.huang@intel.com> wrote:
> >>
> >> Barry Song <21cnbao@gmail.com> writes:
> >>
> >> > From: Barry Song <v-songbaohua@oppo.com>
> >> >
> >> > Both Ryan and Chris have been utilizing the small test program to aid
> >> > in debugging and identifying issues with swap entry allocation. While
> >> > a real or intricate workload might be more suitable for assessing the
> >> > correctness and effectiveness of the swap allocation policy, a small
> >> > test program presents a simpler means of understanding the problem and
> >> > initially verifying the improvements being made.
> >> >
> >> > Let's endeavor to integrate it into the self-test suite. Although it
> >> > presently only accommodates 64KB and 4KB, I'm optimistic that we can
> >> > expand its capabilities to support multiple sizes and simulate more
> >> > complex systems in the future as required.
> >>
> >> IIUC, this is a performance test program instead of functionality test
> >> program.  Does it match the purpose of the kernel selftest?
> >
> > I have a differing perspective. I maintain that the functionality is
> > not functioning
> > as expected. Despite having all the necessary resources for allocation, failure
> > persists, indicating a lack of functionality.
>
> Is there any user visual functionality issue?

Definitely not. If a plane can't take off, taking a train and pretending
there's no functionality issue isn't a solution.

I have never assigned blame for any mistakes here. On the contrary,
I have 100% appreciation for Ryan's work in at least initiating mTHP
swapout w/o being split.

It took countless experiments for humans to make airplanes commercially
viable, but the person who created the first flying airplane remains the
greatest. Similarly, Ryan's efforts, combined with your review of his patch,
have enabled us to achieve a better goal here. Without your work, we can't
get here at all.

However, this is never a reason to refuse to acknowledge that this feature
is not actually working.

>
> >>
> >> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> >> > ---
> >> >  tools/testing/selftests/mm/Makefile           |   1 +
> >> >  .../selftests/mm/thp_swap_allocator_test.c    | 192 ++++++++++++++++++
> >> >  2 files changed, 193 insertions(+)
> >> >  create mode 100644 tools/testing/selftests/mm/thp_swap_allocator_test.c
> >> >
> >> > diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
> >> > index e1aa09ddaa3d..64164ad66835 100644
> >> > --- a/tools/testing/selftests/mm/Makefile
> >> > +++ b/tools/testing/selftests/mm/Makefile
> >> > @@ -65,6 +65,7 @@ TEST_GEN_FILES += mseal_test
> >> >  TEST_GEN_FILES += seal_elf
> >> >  TEST_GEN_FILES += on-fault-limit
> >> >  TEST_GEN_FILES += pagemap_ioctl
> >> > +TEST_GEN_FILES += thp_swap_allocator_test
> >> >  TEST_GEN_FILES += thuge-gen
> >> >  TEST_GEN_FILES += transhuge-stress
> >> >  TEST_GEN_FILES += uffd-stress
> >> > diff --git a/tools/testing/selftests/mm/thp_swap_allocator_test.c b/tools/testing/selftests/mm/thp_swap_allocator_test.c
> >> > new file mode 100644
> >> > index 000000000000..4443a906d0f8
> >> > --- /dev/null
> >> > +++ b/tools/testing/selftests/mm/thp_swap_allocator_test.c
> >> > @@ -0,0 +1,192 @@
> >> > +// SPDX-License-Identifier: GPL-2.0-or-later
> >> > +/*
> >> > + * thp_swap_allocator_test
> >> > + *
> >> > + * The purpose of this test program is helping check if THP swpout
> >> > + * can correctly get swap slots to swap out as a whole instead of
> >> > + * being split. It randomly releases swap entries through madvise
> >> > + * DONTNEED and do swapout on two memory areas: a memory area for
> >> > + * 64KB THP and the other area for small folios. The second memory
> >> > + * can be enabled by "-s".
> >> > + * Before running the program, we need to setup a zRAM or similar
> >> > + * swap device by:
> >> > + *  echo lzo > /sys/block/zram0/comp_algorithm
> >> > + *  echo 64M > /sys/block/zram0/disksize
> >> > + *  echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
> >> > + *  echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
> >> > + *  mkswap /dev/zram0
> >> > + *  swapon /dev/zram0
> >> > + * The expected result should be 0% anon swpout fallback ratio w/ or
> >> > + * w/o "-s".
> >> > + *
> >> > + * Author(s): Barry Song <v-songbaohua@oppo.com>
> >> > + */
> >> > +
> >> > +#define _GNU_SOURCE
> >> > +#include <stdio.h>
> >> > +#include <stdlib.h>
> >> > +#include <unistd.h>
> >> > +#include <string.h>
> >> > +#include <sys/mman.h>
> >> > +#include <errno.h>
> >> > +#include <time.h>
> >> > +
> >> > +#define MEMSIZE_MTHP (60 * 1024 * 1024)
> >> > +#define MEMSIZE_SMALLFOLIO (1 * 1024 * 1024)
> >> > +#define ALIGNMENT_MTHP (64 * 1024)
> >> > +#define ALIGNMENT_SMALLFOLIO (4 * 1024)
> >> > +#define TOTAL_DONTNEED_MTHP (16 * 1024 * 1024)
> >> > +#define TOTAL_DONTNEED_SMALLFOLIO (768 * 1024)
> >> > +#define MTHP_FOLIO_SIZE (64 * 1024)
> >> > +
> >> > +#define SWPOUT_PATH \
> >> > +     "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout"
> >> > +#define SWPOUT_FALLBACK_PATH \
> >> > +     "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback"
> >> > +
> >> > +static void *aligned_alloc_mem(size_t size, size_t alignment)
> >> > +{
> >> > +     void *mem = NULL;
> >> > +
> >> > +     if (posix_memalign(&mem, alignment, size) != 0) {
> >> > +             perror("posix_memalign");
> >> > +             return NULL;
> >> > +     }
> >> > +     return mem;
> >> > +}
> >> > +
> >> > +static void random_madvise_dontneed(void *mem, size_t mem_size,
> >> > +             size_t align_size, size_t total_dontneed_size)
> >> > +{
> >> > +     size_t num_pages = total_dontneed_size / align_size;
> >> > +     size_t i;
> >> > +     size_t offset;
> >> > +     void *addr;
> >> > +
> >> > +     for (i = 0; i < num_pages; ++i) {
> >> > +             offset = (rand() % (mem_size / align_size)) * align_size;
> >> > +             addr = (char *)mem + offset;
> >> > +             if (madvise(addr, align_size, MADV_DONTNEED) != 0)
> >> > +                     perror("madvise dontneed");
> >>
> >> IIUC, this simulates align_size (generally 64KB) swap-in.  That is, it
> >> simulate the effect of large size swap-in when it's not available in
> >> kernel.  If we have large size swap-in in kernel in the future, this
> >> becomes unnecessary.
> >>
> >> Additionally, we have not reached the consensus that we should always
> >> swap-in with swapped-out size.  So, I suspect that this test may not
> >> reflect real situation in the future.  Although it doesn't reflect
> >> current situation too.
> >
> > Disagree again. releasing the whole mTHP swaps is the best case. Even in
> > the best-case scenario, if we fail, it raises concerns for handling potentially
> > more challenging situations.
>
> Repeating sequential anonymous pages writing is the best case.

I define the best case as the scenario with the least chance of creating
fragments within swapfiles for mTHP to swap out. There is no real
difference whether this is done through swapin or madv_dontneed.

>
> > I don't find it hard to incorporate additional features into this test
> > program to simulate more intricate scenarios.
>
> IMHO, we don't really need this special purpose test.  We can have some
> more general basic tests, for example, sequential anonymous pages
> writing/reading, random anonymous pages writing/reading, and combination
> of them.

I understand that not all things will be loved by all people. However, before
I sent this patch, Chris mentioned that it has been very helpful for him and
strongly suggested that I contribute it to the self-test suite.

By the way, adding sequential and random anonymous pages for
read/write operations is definitely in my plan. The absence of this feature
isn't a convincing reason to disregard it.

>
> --
> Best Regards,
> Huang, Ying
>
> >>
> >> > +
> >> > +             memset(addr, 0x11, align_size);
> >> > +     }
> >> > +}
> >> > +
> >>
> >> [snip]
> >>
> >> --
> >> Best Regards,
> >> Huang, Ying

Thanks
Barry
Huang, Ying June 20, 2024, 6:34 a.m. UTC | #5
Barry Song <21cnbao@gmail.com> writes:

> On Thu, Jun 20, 2024 at 5:22 PM Huang, Ying <ying.huang@intel.com> wrote:
>>
>> Barry Song <21cnbao@gmail.com> writes:
>>
>> > On Thu, Jun 20, 2024 at 1:55 PM Huang, Ying <ying.huang@intel.com> wrote:
>> >>
>> >> Barry Song <21cnbao@gmail.com> writes:
>> >>
>> >> > From: Barry Song <v-songbaohua@oppo.com>
>> >> >
>> >> > Both Ryan and Chris have been utilizing the small test program to aid
>> >> > in debugging and identifying issues with swap entry allocation. While
>> >> > a real or intricate workload might be more suitable for assessing the
>> >> > correctness and effectiveness of the swap allocation policy, a small
>> >> > test program presents a simpler means of understanding the problem and
>> >> > initially verifying the improvements being made.
>> >> >
>> >> > Let's endeavor to integrate it into the self-test suite. Although it
>> >> > presently only accommodates 64KB and 4KB, I'm optimistic that we can
>> >> > expand its capabilities to support multiple sizes and simulate more
>> >> > complex systems in the future as required.
>> >>
>> >> IIUC, this is a performance test program instead of functionality test
>> >> program.  Does it match the purpose of the kernel selftest?
>> >
>> > I have a differing perspective. I maintain that the functionality is
>> > not functioning
>> > as expected. Despite having all the necessary resources for allocation, failure
>> > persists, indicating a lack of functionality.
>>
>> Is there any user visual functionality issue?
>
> Definitely not. If a plane can't take off, taking a train and pretending
> there's no functionality issue isn't a solution.

I always think that performance optimization is great work.  However, it
is not functionality work.

> I have never assigned blame for any mistakes here. On the contrary,
> I have 100% appreciation for Ryan's work in at least initiating mTHP
> swapout w/o being split.
>
> It took countless experiments for humans to make airplanes commercially
> viable, but the person who created the first flying airplane remains the
> greatest. Similarly, Ryan's efforts, combined with your review of his patch,
> have enabled us to achieve a better goal here. Without your work, we can't
> get here at all.

Thanks!

> However, this is never a reason to refuse to acknowledge that this feature
> is not actually working.

It just works for some workloads, not for some others.

>>
>> >>
>> >> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>> >> > ---
>> >> >  tools/testing/selftests/mm/Makefile           |   1 +
>> >> >  .../selftests/mm/thp_swap_allocator_test.c    | 192 ++++++++++++++++++
>> >> >  2 files changed, 193 insertions(+)
>> >> >  create mode 100644 tools/testing/selftests/mm/thp_swap_allocator_test.c
>> >> >
>> >> > diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
>> >> > index e1aa09ddaa3d..64164ad66835 100644
>> >> > --- a/tools/testing/selftests/mm/Makefile
>> >> > +++ b/tools/testing/selftests/mm/Makefile
>> >> > @@ -65,6 +65,7 @@ TEST_GEN_FILES += mseal_test
>> >> >  TEST_GEN_FILES += seal_elf
>> >> >  TEST_GEN_FILES += on-fault-limit
>> >> >  TEST_GEN_FILES += pagemap_ioctl
>> >> > +TEST_GEN_FILES += thp_swap_allocator_test
>> >> >  TEST_GEN_FILES += thuge-gen
>> >> >  TEST_GEN_FILES += transhuge-stress
>> >> >  TEST_GEN_FILES += uffd-stress
>> >> > diff --git a/tools/testing/selftests/mm/thp_swap_allocator_test.c b/tools/testing/selftests/mm/thp_swap_allocator_test.c
>> >> > new file mode 100644
>> >> > index 000000000000..4443a906d0f8
>> >> > --- /dev/null
>> >> > +++ b/tools/testing/selftests/mm/thp_swap_allocator_test.c
>> >> > @@ -0,0 +1,192 @@
>> >> > +// SPDX-License-Identifier: GPL-2.0-or-later
>> >> > +/*
>> >> > + * thp_swap_allocator_test
>> >> > + *
>> >> > + * The purpose of this test program is helping check if THP swpout
>> >> > + * can correctly get swap slots to swap out as a whole instead of
>> >> > + * being split. It randomly releases swap entries through madvise
>> >> > + * DONTNEED and do swapout on two memory areas: a memory area for
>> >> > + * 64KB THP and the other area for small folios. The second memory
>> >> > + * can be enabled by "-s".
>> >> > + * Before running the program, we need to setup a zRAM or similar
>> >> > + * swap device by:
>> >> > + *  echo lzo > /sys/block/zram0/comp_algorithm
>> >> > + *  echo 64M > /sys/block/zram0/disksize
>> >> > + *  echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
>> >> > + *  echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
>> >> > + *  mkswap /dev/zram0
>> >> > + *  swapon /dev/zram0
>> >> > + * The expected result should be 0% anon swpout fallback ratio w/ or
>> >> > + * w/o "-s".
>> >> > + *
>> >> > + * Author(s): Barry Song <v-songbaohua@oppo.com>
>> >> > + */
>> >> > +
>> >> > +#define _GNU_SOURCE
>> >> > +#include <stdio.h>
>> >> > +#include <stdlib.h>
>> >> > +#include <unistd.h>
>> >> > +#include <string.h>
>> >> > +#include <sys/mman.h>
>> >> > +#include <errno.h>
>> >> > +#include <time.h>
>> >> > +
>> >> > +#define MEMSIZE_MTHP (60 * 1024 * 1024)
>> >> > +#define MEMSIZE_SMALLFOLIO (1 * 1024 * 1024)
>> >> > +#define ALIGNMENT_MTHP (64 * 1024)
>> >> > +#define ALIGNMENT_SMALLFOLIO (4 * 1024)
>> >> > +#define TOTAL_DONTNEED_MTHP (16 * 1024 * 1024)
>> >> > +#define TOTAL_DONTNEED_SMALLFOLIO (768 * 1024)
>> >> > +#define MTHP_FOLIO_SIZE (64 * 1024)
>> >> > +
>> >> > +#define SWPOUT_PATH \
>> >> > +     "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout"
>> >> > +#define SWPOUT_FALLBACK_PATH \
>> >> > +     "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback"
>> >> > +
>> >> > +static void *aligned_alloc_mem(size_t size, size_t alignment)
>> >> > +{
>> >> > +     void *mem = NULL;
>> >> > +
>> >> > +     if (posix_memalign(&mem, alignment, size) != 0) {
>> >> > +             perror("posix_memalign");
>> >> > +             return NULL;
>> >> > +     }
>> >> > +     return mem;
>> >> > +}
>> >> > +
>> >> > +static void random_madvise_dontneed(void *mem, size_t mem_size,
>> >> > +             size_t align_size, size_t total_dontneed_size)
>> >> > +{
>> >> > +     size_t num_pages = total_dontneed_size / align_size;
>> >> > +     size_t i;
>> >> > +     size_t offset;
>> >> > +     void *addr;
>> >> > +
>> >> > +     for (i = 0; i < num_pages; ++i) {
>> >> > +             offset = (rand() % (mem_size / align_size)) * align_size;
>> >> > +             addr = (char *)mem + offset;
>> >> > +             if (madvise(addr, align_size, MADV_DONTNEED) != 0)
>> >> > +                     perror("madvise dontneed");
>> >>
>> >> IIUC, this simulates align_size (generally 64KB) swap-in.  That is, it
>> >> simulate the effect of large size swap-in when it's not available in
>> >> kernel.  If we have large size swap-in in kernel in the future, this
>> >> becomes unnecessary.
>> >>
>> >> Additionally, we have not reached the consensus that we should always
>> >> swap-in with swapped-out size.  So, I suspect that this test may not
>> >> reflect real situation in the future.  Although it doesn't reflect
>> >> current situation too.
>> >
>> > Disagree again. releasing the whole mTHP swaps is the best case. Even in
>> > the best-case scenario, if we fail, it raises concerns for handling potentially
>> > more challenging situations.
>>
>> Repeating sequential anonymous pages writing is the best case.
>
> I define the best case as the scenario with the least chance of creating
> fragments within swapfiles for mTHP to swap out. There is no real
> difference whether this is done through swapin or madv_dontneed.

IMO, swapin is much more important than madv_dontneed.  Because most
users use swapin automatically, but few use madv_dontneed by hand.  So,
I think swapin/swapout test is much more important than madv_dontneed.
I don't like this test case because madv_dontneed isn't typical or
basic.

>>
>> > I don't find it hard to incorporate additional features into this test
>> > program to simulate more intricate scenarios.
>>
>> IMHO, we don't really need this special purpose test.  We can have some
>> more general basic tests, for example, sequential anonymous pages
>> writing/reading, random anonymous pages writing/reading, and combination
>> of them.
>
> I understand that not all things will be loved by all people. However, before
> I sent this patch, Chris mentioned that it has been very helpful for him and
> strongly suggested that I contribute it to the self-test suite.
>
> By the way, adding sequential and random anonymous pages for
> read/write operations is definitely in my plan. The absence of this feature
> isn't a convincing reason to disregard it.
>

[snip]

--
Best Regards,
Huang, Ying
David Hildenbrand June 20, 2024, 7:20 a.m. UTC | #6
On 20.06.24 03:53, Huang, Ying wrote:
> Barry Song <21cnbao@gmail.com> writes:
> 
>> From: Barry Song <v-songbaohua@oppo.com>
>>
>> Both Ryan and Chris have been utilizing the small test program to aid
>> in debugging and identifying issues with swap entry allocation. While
>> a real or intricate workload might be more suitable for assessing the
>> correctness and effectiveness of the swap allocation policy, a small
>> test program presents a simpler means of understanding the problem and
>> initially verifying the improvements being made.
>>
>> Let's endeavor to integrate it into the self-test suite. Although it
>> presently only accommodates 64KB and 4KB, I'm optimistic that we can
>> expand its capabilities to support multiple sizes and simulate more
>> complex systems in the future as required.
> 
> IIUC, this is a performance test program instead of functionality test
> program.  Does it match the purpose of the kernel selftest?

We do have the similar tests at least for ksm (ksm_tests.c) and probably 
others:

$ git grep -l clock_gettime
ksm_tests.c
migration.c
mremap_test.c
transhuge-stress.c


I recall that gup_test.c also measures performance things.
Huang, Ying June 20, 2024, 7:24 a.m. UTC | #7
David Hildenbrand <david@redhat.com> writes:

> On 20.06.24 03:53, Huang, Ying wrote:
>> Barry Song <21cnbao@gmail.com> writes:
>> 
>>> From: Barry Song <v-songbaohua@oppo.com>
>>>
>>> Both Ryan and Chris have been utilizing the small test program to aid
>>> in debugging and identifying issues with swap entry allocation. While
>>> a real or intricate workload might be more suitable for assessing the
>>> correctness and effectiveness of the swap allocation policy, a small
>>> test program presents a simpler means of understanding the problem and
>>> initially verifying the improvements being made.
>>>
>>> Let's endeavor to integrate it into the self-test suite. Although it
>>> presently only accommodates 64KB and 4KB, I'm optimistic that we can
>>> expand its capabilities to support multiple sizes and simulate more
>>> complex systems in the future as required.
>> IIUC, this is a performance test program instead of functionality
>> test
>> program.  Does it match the purpose of the kernel selftest?
>
> We do have the similar tests at least for ksm (ksm_tests.c) and
> probably others:
>
> $ git grep -l clock_gettime
> ksm_tests.c
> migration.c
> mremap_test.c
> transhuge-stress.c
>
>
> I recall that gup_test.c also measures performance things.

Good to know that!  Thanks for your information!

--
Best Regards,
Huang, Ying
Barry Song June 20, 2024, 7:25 a.m. UTC | #8
On Thu, Jun 20, 2024 at 6:36 PM Huang, Ying <ying.huang@intel.com> wrote:
>
> Barry Song <21cnbao@gmail.com> writes:
>
> > On Thu, Jun 20, 2024 at 5:22 PM Huang, Ying <ying.huang@intel.com> wrote:
> >>
> >> Barry Song <21cnbao@gmail.com> writes:
> >>
> >> > On Thu, Jun 20, 2024 at 1:55 PM Huang, Ying <ying.huang@intel.com> wrote:
> >> >>
> >> >> Barry Song <21cnbao@gmail.com> writes:
> >> >>
> >> >> > From: Barry Song <v-songbaohua@oppo.com>
> >> >> >
> >> >> > Both Ryan and Chris have been utilizing the small test program to aid
> >> >> > in debugging and identifying issues with swap entry allocation. While
> >> >> > a real or intricate workload might be more suitable for assessing the
> >> >> > correctness and effectiveness of the swap allocation policy, a small
> >> >> > test program presents a simpler means of understanding the problem and
> >> >> > initially verifying the improvements being made.
> >> >> >
> >> >> > Let's endeavor to integrate it into the self-test suite. Although it
> >> >> > presently only accommodates 64KB and 4KB, I'm optimistic that we can
> >> >> > expand its capabilities to support multiple sizes and simulate more
> >> >> > complex systems in the future as required.
> >> >>
> >> >> IIUC, this is a performance test program instead of functionality test
> >> >> program.  Does it match the purpose of the kernel selftest?
> >> >
> >> > I have a differing perspective. I maintain that the functionality is
> >> > not functioning
> >> > as expected. Despite having all the necessary resources for allocation, failure
> >> > persists, indicating a lack of functionality.
> >>
> >> Is there any user visual functionality issue?
> >
> > Definitely not. If a plane can't take off, taking a train and pretending
> > there's no functionality issue isn't a solution.
>
> I always think that performance optimization is great work.  However, it
> is not functionality work.
>
> > I have never assigned blame for any mistakes here. On the contrary,
> > I have 100% appreciation for Ryan's work in at least initiating mTHP
> > swapout w/o being split.
> >
> > It took countless experiments for humans to make airplanes commercially
> > viable, but the person who created the first flying airplane remains the
> > greatest. Similarly, Ryan's efforts, combined with your review of his patch,
> > have enabled us to achieve a better goal here. Without your work, we can't
> > get here at all.
>
> Thanks!
>
> > However, this is never a reason to refuse to acknowledge that this feature
> > is not actually working.
>
> It just works for some workloads, not for some others.
>
> >>
> >> >>
> >> >> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> >> >> > ---
> >> >> >  tools/testing/selftests/mm/Makefile           |   1 +
> >> >> >  .../selftests/mm/thp_swap_allocator_test.c    | 192 ++++++++++++++++++
> >> >> >  2 files changed, 193 insertions(+)
> >> >> >  create mode 100644 tools/testing/selftests/mm/thp_swap_allocator_test.c
> >> >> >
> >> >> > diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
> >> >> > index e1aa09ddaa3d..64164ad66835 100644
> >> >> > --- a/tools/testing/selftests/mm/Makefile
> >> >> > +++ b/tools/testing/selftests/mm/Makefile
> >> >> > @@ -65,6 +65,7 @@ TEST_GEN_FILES += mseal_test
> >> >> >  TEST_GEN_FILES += seal_elf
> >> >> >  TEST_GEN_FILES += on-fault-limit
> >> >> >  TEST_GEN_FILES += pagemap_ioctl
> >> >> > +TEST_GEN_FILES += thp_swap_allocator_test
> >> >> >  TEST_GEN_FILES += thuge-gen
> >> >> >  TEST_GEN_FILES += transhuge-stress
> >> >> >  TEST_GEN_FILES += uffd-stress
> >> >> > diff --git a/tools/testing/selftests/mm/thp_swap_allocator_test.c b/tools/testing/selftests/mm/thp_swap_allocator_test.c
> >> >> > new file mode 100644
> >> >> > index 000000000000..4443a906d0f8
> >> >> > --- /dev/null
> >> >> > +++ b/tools/testing/selftests/mm/thp_swap_allocator_test.c
> >> >> > @@ -0,0 +1,192 @@
> >> >> > +// SPDX-License-Identifier: GPL-2.0-or-later
> >> >> > +/*
> >> >> > + * thp_swap_allocator_test
> >> >> > + *
> >> >> > + * The purpose of this test program is helping check if THP swpout
> >> >> > + * can correctly get swap slots to swap out as a whole instead of
> >> >> > + * being split. It randomly releases swap entries through madvise
> >> >> > + * DONTNEED and do swapout on two memory areas: a memory area for
> >> >> > + * 64KB THP and the other area for small folios. The second memory
> >> >> > + * can be enabled by "-s".
> >> >> > + * Before running the program, we need to setup a zRAM or similar
> >> >> > + * swap device by:
> >> >> > + *  echo lzo > /sys/block/zram0/comp_algorithm
> >> >> > + *  echo 64M > /sys/block/zram0/disksize
> >> >> > + *  echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
> >> >> > + *  echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
> >> >> > + *  mkswap /dev/zram0
> >> >> > + *  swapon /dev/zram0
> >> >> > + * The expected result should be 0% anon swpout fallback ratio w/ or
> >> >> > + * w/o "-s".
> >> >> > + *
> >> >> > + * Author(s): Barry Song <v-songbaohua@oppo.com>
> >> >> > + */
> >> >> > +
> >> >> > +#define _GNU_SOURCE
> >> >> > +#include <stdio.h>
> >> >> > +#include <stdlib.h>
> >> >> > +#include <unistd.h>
> >> >> > +#include <string.h>
> >> >> > +#include <sys/mman.h>
> >> >> > +#include <errno.h>
> >> >> > +#include <time.h>
> >> >> > +
> >> >> > +#define MEMSIZE_MTHP (60 * 1024 * 1024)
> >> >> > +#define MEMSIZE_SMALLFOLIO (1 * 1024 * 1024)
> >> >> > +#define ALIGNMENT_MTHP (64 * 1024)
> >> >> > +#define ALIGNMENT_SMALLFOLIO (4 * 1024)
> >> >> > +#define TOTAL_DONTNEED_MTHP (16 * 1024 * 1024)
> >> >> > +#define TOTAL_DONTNEED_SMALLFOLIO (768 * 1024)
> >> >> > +#define MTHP_FOLIO_SIZE (64 * 1024)
> >> >> > +
> >> >> > +#define SWPOUT_PATH \
> >> >> > +     "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout"
> >> >> > +#define SWPOUT_FALLBACK_PATH \
> >> >> > +     "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback"
> >> >> > +
> >> >> > +static void *aligned_alloc_mem(size_t size, size_t alignment)
> >> >> > +{
> >> >> > +     void *mem = NULL;
> >> >> > +
> >> >> > +     if (posix_memalign(&mem, alignment, size) != 0) {
> >> >> > +             perror("posix_memalign");
> >> >> > +             return NULL;
> >> >> > +     }
> >> >> > +     return mem;
> >> >> > +}
> >> >> > +
> >> >> > +static void random_madvise_dontneed(void *mem, size_t mem_size,
> >> >> > +             size_t align_size, size_t total_dontneed_size)
> >> >> > +{
> >> >> > +     size_t num_pages = total_dontneed_size / align_size;
> >> >> > +     size_t i;
> >> >> > +     size_t offset;
> >> >> > +     void *addr;
> >> >> > +
> >> >> > +     for (i = 0; i < num_pages; ++i) {
> >> >> > +             offset = (rand() % (mem_size / align_size)) * align_size;
> >> >> > +             addr = (char *)mem + offset;
> >> >> > +             if (madvise(addr, align_size, MADV_DONTNEED) != 0)
> >> >> > +                     perror("madvise dontneed");
> >> >>
> >> >> IIUC, this simulates align_size (generally 64KB) swap-in.  That is, it
> >> >> simulate the effect of large size swap-in when it's not available in
> >> >> kernel.  If we have large size swap-in in kernel in the future, this
> >> >> becomes unnecessary.
> >> >>
> >> >> Additionally, we have not reached the consensus that we should always
> >> >> swap-in with swapped-out size.  So, I suspect that this test may not
> >> >> reflect real situation in the future.  Although it doesn't reflect
> >> >> current situation too.
> >> >
> >> > Disagree again. releasing the whole mTHP swaps is the best case. Even in
> >> > the best-case scenario, if we fail, it raises concerns for handling potentially
> >> > more challenging situations.
> >>
> >> Repeating sequential anonymous pages writing is the best case.
> >
> > I define the best case as the scenario with the least chance of creating
> > fragments within swapfiles for mTHP to swap out. There is no real
> > difference whether this is done through swapin or madv_dontneed.
>
> IMO, swapin is much more important than madv_dontneed.  Because most
> users use swapin automatically, but few use madv_dontneed by hand.  So,
> I think swapin/swapout test is much more important than madv_dontneed.
> I don't like this test case because madv_dontneed isn't typical or
> basic.

Disliking DONTNEED isn't a sufficient reason to reject this test program because
no single small program can report swapout counters, swapout fallback counters,
and fallback ratios within several minutes for 100 iterations. That's
precisely why
we need it, at least initially. We can enhance it further if it lacks
certain functionalities
that people desire.

The entire purpose of MADV_DONTNEED is to simulate a scenario where all
slots are released as a whole, preventing the creation of fragments, which is
most favorable for swap allocation. I believe there is no difference between
using MADV_DONTNEED or swapin for this purpose. But I am perfectly fine
with switching to swapin to replace MADV_DONTNEED in v2.

I will simply replace DONTNEED by swapping in all 16 subpages every time as
the initial commit, as I anticipate that this approach will yield the
best test results.

I anticipate that the optimization process will comprise three steps in total.

1. If our swapin process doesn't generate fragments(always swapin all subpages),
we achieve a 0% fallback ratio with Chris's and Ryan's current optimizations.

2. With the current optimizations from Chris and Ryan, we achieve a
fallback ratio
of less than 50% when generating fragments during swapping in by randomly
swapping in a portion of subpages.

The positive outcome is that we tested Ryan's V1 on an actual phone that swaps
in by small folios at 50% percentage (because we have 50% chance to fallback
while allocating mTHP within do_swap_page()). Despite this, we still achieved
a 0% fallback ratio when using two zRAMs: one for small folios and the other for
large folios. My assumption is that anonymous memory still maintains
good spatial
locality, allowing all subpages to eventually be accessed even though they are
swapped in one by one. So finally fragments are removed sooner or later.

3. We still maintain a 0% fallback ratio with Chris's long-term plan to optimize
   swapout, even using non-discontiguous slots. I actually don't find
it difficult
   if we can save a swap offset in subpage's field. But obviously
people don't like
   this because the trend is to remove subpage's stuff as much as possible :-)

>
> >>
> >> > I don't find it hard to incorporate additional features into this test
> >> > program to simulate more intricate scenarios.
> >>
> >> IMHO, we don't really need this special purpose test.  We can have some
> >> more general basic tests, for example, sequential anonymous pages
> >> writing/reading, random anonymous pages writing/reading, and combination
> >> of them.
> >
> > I understand that not all things will be loved by all people. However, before
> > I sent this patch, Chris mentioned that it has been very helpful for him and
> > strongly suggested that I contribute it to the self-test suite.
> >
> > By the way, adding sequential and random anonymous pages for
> > read/write operations is definitely in my plan. The absence of this feature
> > isn't a convincing reason to disregard it.
> >
>
> [snip]
>
> --
> Best Regards,
> Huang, Ying

Thanks
Barry
Huang, Ying June 20, 2024, 7:59 a.m. UTC | #9
Barry Song <21cnbao@gmail.com> writes:

> On Thu, Jun 20, 2024 at 6:36 PM Huang, Ying <ying.huang@intel.com> wrote:
>>
>> Barry Song <21cnbao@gmail.com> writes:
>>
>> > On Thu, Jun 20, 2024 at 5:22 PM Huang, Ying <ying.huang@intel.com> wrote:
>> >>
>> >> Barry Song <21cnbao@gmail.com> writes:
>> >>
>> >> > On Thu, Jun 20, 2024 at 1:55 PM Huang, Ying <ying.huang@intel.com> wrote:
>> >> >>
>> >> >> Barry Song <21cnbao@gmail.com> writes:
>> >> >>
>> >> >> > From: Barry Song <v-songbaohua@oppo.com>
>> >> >> >
>> >> >> > Both Ryan and Chris have been utilizing the small test program to aid
>> >> >> > in debugging and identifying issues with swap entry allocation. While
>> >> >> > a real or intricate workload might be more suitable for assessing the
>> >> >> > correctness and effectiveness of the swap allocation policy, a small
>> >> >> > test program presents a simpler means of understanding the problem and
>> >> >> > initially verifying the improvements being made.
>> >> >> >
>> >> >> > Let's endeavor to integrate it into the self-test suite. Although it
>> >> >> > presently only accommodates 64KB and 4KB, I'm optimistic that we can
>> >> >> > expand its capabilities to support multiple sizes and simulate more
>> >> >> > complex systems in the future as required.
>> >> >>
>> >> >> IIUC, this is a performance test program instead of functionality test
>> >> >> program.  Does it match the purpose of the kernel selftest?
>> >> >
>> >> > I have a differing perspective. I maintain that the functionality is
>> >> > not functioning
>> >> > as expected. Despite having all the necessary resources for allocation, failure
>> >> > persists, indicating a lack of functionality.
>> >>
>> >> Is there any user visual functionality issue?
>> >
>> > Definitely not. If a plane can't take off, taking a train and pretending
>> > there's no functionality issue isn't a solution.
>>
>> I always think that performance optimization is great work.  However, it
>> is not functionality work.
>>
>> > I have never assigned blame for any mistakes here. On the contrary,
>> > I have 100% appreciation for Ryan's work in at least initiating mTHP
>> > swapout w/o being split.
>> >
>> > It took countless experiments for humans to make airplanes commercially
>> > viable, but the person who created the first flying airplane remains the
>> > greatest. Similarly, Ryan's efforts, combined with your review of his patch,
>> > have enabled us to achieve a better goal here. Without your work, we can't
>> > get here at all.
>>
>> Thanks!
>>
>> > However, this is never a reason to refuse to acknowledge that this feature
>> > is not actually working.
>>
>> It just works for some workloads, not for some others.
>>
>> >>
>> >> >>
>> >> >> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>> >> >> > ---
>> >> >> >  tools/testing/selftests/mm/Makefile           |   1 +
>> >> >> >  .../selftests/mm/thp_swap_allocator_test.c    | 192 ++++++++++++++++++
>> >> >> >  2 files changed, 193 insertions(+)
>> >> >> >  create mode 100644 tools/testing/selftests/mm/thp_swap_allocator_test.c
>> >> >> >
>> >> >> > diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
>> >> >> > index e1aa09ddaa3d..64164ad66835 100644
>> >> >> > --- a/tools/testing/selftests/mm/Makefile
>> >> >> > +++ b/tools/testing/selftests/mm/Makefile
>> >> >> > @@ -65,6 +65,7 @@ TEST_GEN_FILES += mseal_test
>> >> >> >  TEST_GEN_FILES += seal_elf
>> >> >> >  TEST_GEN_FILES += on-fault-limit
>> >> >> >  TEST_GEN_FILES += pagemap_ioctl
>> >> >> > +TEST_GEN_FILES += thp_swap_allocator_test
>> >> >> >  TEST_GEN_FILES += thuge-gen
>> >> >> >  TEST_GEN_FILES += transhuge-stress
>> >> >> >  TEST_GEN_FILES += uffd-stress
>> >> >> > diff --git a/tools/testing/selftests/mm/thp_swap_allocator_test.c b/tools/testing/selftests/mm/thp_swap_allocator_test.c
>> >> >> > new file mode 100644
>> >> >> > index 000000000000..4443a906d0f8
>> >> >> > --- /dev/null
>> >> >> > +++ b/tools/testing/selftests/mm/thp_swap_allocator_test.c
>> >> >> > @@ -0,0 +1,192 @@
>> >> >> > +// SPDX-License-Identifier: GPL-2.0-or-later
>> >> >> > +/*
>> >> >> > + * thp_swap_allocator_test
>> >> >> > + *
>> >> >> > + * The purpose of this test program is helping check if THP swpout
>> >> >> > + * can correctly get swap slots to swap out as a whole instead of
>> >> >> > + * being split. It randomly releases swap entries through madvise
>> >> >> > + * DONTNEED and do swapout on two memory areas: a memory area for
>> >> >> > + * 64KB THP and the other area for small folios. The second memory
>> >> >> > + * can be enabled by "-s".
>> >> >> > + * Before running the program, we need to setup a zRAM or similar
>> >> >> > + * swap device by:
>> >> >> > + *  echo lzo > /sys/block/zram0/comp_algorithm
>> >> >> > + *  echo 64M > /sys/block/zram0/disksize
>> >> >> > + *  echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
>> >> >> > + *  echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
>> >> >> > + *  mkswap /dev/zram0
>> >> >> > + *  swapon /dev/zram0
>> >> >> > + * The expected result should be 0% anon swpout fallback ratio w/ or
>> >> >> > + * w/o "-s".
>> >> >> > + *
>> >> >> > + * Author(s): Barry Song <v-songbaohua@oppo.com>
>> >> >> > + */
>> >> >> > +
>> >> >> > +#define _GNU_SOURCE
>> >> >> > +#include <stdio.h>
>> >> >> > +#include <stdlib.h>
>> >> >> > +#include <unistd.h>
>> >> >> > +#include <string.h>
>> >> >> > +#include <sys/mman.h>
>> >> >> > +#include <errno.h>
>> >> >> > +#include <time.h>
>> >> >> > +
>> >> >> > +#define MEMSIZE_MTHP (60 * 1024 * 1024)
>> >> >> > +#define MEMSIZE_SMALLFOLIO (1 * 1024 * 1024)
>> >> >> > +#define ALIGNMENT_MTHP (64 * 1024)
>> >> >> > +#define ALIGNMENT_SMALLFOLIO (4 * 1024)
>> >> >> > +#define TOTAL_DONTNEED_MTHP (16 * 1024 * 1024)
>> >> >> > +#define TOTAL_DONTNEED_SMALLFOLIO (768 * 1024)
>> >> >> > +#define MTHP_FOLIO_SIZE (64 * 1024)
>> >> >> > +
>> >> >> > +#define SWPOUT_PATH \
>> >> >> > +     "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout"
>> >> >> > +#define SWPOUT_FALLBACK_PATH \
>> >> >> > +     "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback"
>> >> >> > +
>> >> >> > +static void *aligned_alloc_mem(size_t size, size_t alignment)
>> >> >> > +{
>> >> >> > +     void *mem = NULL;
>> >> >> > +
>> >> >> > +     if (posix_memalign(&mem, alignment, size) != 0) {
>> >> >> > +             perror("posix_memalign");
>> >> >> > +             return NULL;
>> >> >> > +     }
>> >> >> > +     return mem;
>> >> >> > +}
>> >> >> > +
>> >> >> > +static void random_madvise_dontneed(void *mem, size_t mem_size,
>> >> >> > +             size_t align_size, size_t total_dontneed_size)
>> >> >> > +{
>> >> >> > +     size_t num_pages = total_dontneed_size / align_size;
>> >> >> > +     size_t i;
>> >> >> > +     size_t offset;
>> >> >> > +     void *addr;
>> >> >> > +
>> >> >> > +     for (i = 0; i < num_pages; ++i) {
>> >> >> > +             offset = (rand() % (mem_size / align_size)) * align_size;
>> >> >> > +             addr = (char *)mem + offset;
>> >> >> > +             if (madvise(addr, align_size, MADV_DONTNEED) != 0)
>> >> >> > +                     perror("madvise dontneed");
>> >> >>
>> >> >> IIUC, this simulates align_size (generally 64KB) swap-in.  That is, it
>> >> >> simulate the effect of large size swap-in when it's not available in
>> >> >> kernel.  If we have large size swap-in in kernel in the future, this
>> >> >> becomes unnecessary.
>> >> >>
>> >> >> Additionally, we have not reached the consensus that we should always
>> >> >> swap-in with swapped-out size.  So, I suspect that this test may not
>> >> >> reflect real situation in the future.  Although it doesn't reflect
>> >> >> current situation too.
>> >> >
>> >> > Disagree again. releasing the whole mTHP swaps is the best case. Even in
>> >> > the best-case scenario, if we fail, it raises concerns for handling potentially
>> >> > more challenging situations.
>> >>
>> >> Repeating sequential anonymous pages writing is the best case.
>> >
>> > I define the best case as the scenario with the least chance of creating
>> > fragments within swapfiles for mTHP to swap out. There is no real
>> > difference whether this is done through swapin or madv_dontneed.
>>
>> IMO, swapin is much more important than madv_dontneed.  Because most
>> users use swapin automatically, but few use madv_dontneed by hand.  So,
>> I think swapin/swapout test is much more important than madv_dontneed.
>> I don't like this test case because madv_dontneed isn't typical or
>> basic.
>
> Disliking DONTNEED isn't a sufficient reason to reject this test program because
> no single small program can report swapout counters, swapout fallback counters,
> and fallback ratios within several minutes for 100 iterations. That's
> precisely why
> we need it, at least initially. We can enhance it further if it lacks
> certain functionalities
> that people desire.
>
> The entire purpose of MADV_DONTNEED is to simulate a scenario where all
> slots are released as a whole, preventing the creation of fragments, which is
> most favorable for swap allocation. I believe there is no difference between
> using MADV_DONTNEED or swapin for this purpose. But I am perfectly fine
> with switching to swapin to replace MADV_DONTNEED in v2.

Great!  Thanks for doing this!

And even better, can we not make swap-in address aligned and size
aligned?  It's too unrealistic.  It's good to consider some level of
spatial locality, for example, swap-in random number of pages
sequentially at some random addresses.  That could be a good general
test program.  We can use it to evaluate further swap optimizations, for
example, to evaluate the memory wastage of some swap-in size policy.

And, we don't need PAGEOUT too, just use large virtual address space in
test programs.  We can trigger swapout in more common way.

[snip]

--
Best Regards,
Huang, Ying
Barry Song June 20, 2024, 8:11 a.m. UTC | #10
On Thu, Jun 20, 2024 at 8:01 PM Huang, Ying <ying.huang@intel.com> wrote:
>
> Barry Song <21cnbao@gmail.com> writes:
>
> > On Thu, Jun 20, 2024 at 6:36 PM Huang, Ying <ying.huang@intel.com> wrote:
> >>
> >> Barry Song <21cnbao@gmail.com> writes:
> >>
> >> > On Thu, Jun 20, 2024 at 5:22 PM Huang, Ying <ying.huang@intel.com> wrote:
> >> >>
> >> >> Barry Song <21cnbao@gmail.com> writes:
> >> >>
> >> >> > On Thu, Jun 20, 2024 at 1:55 PM Huang, Ying <ying.huang@intel.com> wrote:
> >> >> >>
> >> >> >> Barry Song <21cnbao@gmail.com> writes:
> >> >> >>
> >> >> >> > From: Barry Song <v-songbaohua@oppo.com>
> >> >> >> >
> >> >> >> > Both Ryan and Chris have been utilizing the small test program to aid
> >> >> >> > in debugging and identifying issues with swap entry allocation. While
> >> >> >> > a real or intricate workload might be more suitable for assessing the
> >> >> >> > correctness and effectiveness of the swap allocation policy, a small
> >> >> >> > test program presents a simpler means of understanding the problem and
> >> >> >> > initially verifying the improvements being made.
> >> >> >> >
> >> >> >> > Let's endeavor to integrate it into the self-test suite. Although it
> >> >> >> > presently only accommodates 64KB and 4KB, I'm optimistic that we can
> >> >> >> > expand its capabilities to support multiple sizes and simulate more
> >> >> >> > complex systems in the future as required.
> >> >> >>
> >> >> >> IIUC, this is a performance test program instead of functionality test
> >> >> >> program.  Does it match the purpose of the kernel selftest?
> >> >> >
> >> >> > I have a differing perspective. I maintain that the functionality is
> >> >> > not functioning
> >> >> > as expected. Despite having all the necessary resources for allocation, failure
> >> >> > persists, indicating a lack of functionality.
> >> >>
> >> >> Is there any user visual functionality issue?
> >> >
> >> > Definitely not. If a plane can't take off, taking a train and pretending
> >> > there's no functionality issue isn't a solution.
> >>
> >> I always think that performance optimization is great work.  However, it
> >> is not functionality work.
> >>
> >> > I have never assigned blame for any mistakes here. On the contrary,
> >> > I have 100% appreciation for Ryan's work in at least initiating mTHP
> >> > swapout w/o being split.
> >> >
> >> > It took countless experiments for humans to make airplanes commercially
> >> > viable, but the person who created the first flying airplane remains the
> >> > greatest. Similarly, Ryan's efforts, combined with your review of his patch,
> >> > have enabled us to achieve a better goal here. Without your work, we can't
> >> > get here at all.
> >>
> >> Thanks!
> >>
> >> > However, this is never a reason to refuse to acknowledge that this feature
> >> > is not actually working.
> >>
> >> It just works for some workloads, not for some others.
> >>
> >> >>
> >> >> >>
> >> >> >> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> >> >> >> > ---
> >> >> >> >  tools/testing/selftests/mm/Makefile           |   1 +
> >> >> >> >  .../selftests/mm/thp_swap_allocator_test.c    | 192 ++++++++++++++++++
> >> >> >> >  2 files changed, 193 insertions(+)
> >> >> >> >  create mode 100644 tools/testing/selftests/mm/thp_swap_allocator_test.c
> >> >> >> >
> >> >> >> > diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
> >> >> >> > index e1aa09ddaa3d..64164ad66835 100644
> >> >> >> > --- a/tools/testing/selftests/mm/Makefile
> >> >> >> > +++ b/tools/testing/selftests/mm/Makefile
> >> >> >> > @@ -65,6 +65,7 @@ TEST_GEN_FILES += mseal_test
> >> >> >> >  TEST_GEN_FILES += seal_elf
> >> >> >> >  TEST_GEN_FILES += on-fault-limit
> >> >> >> >  TEST_GEN_FILES += pagemap_ioctl
> >> >> >> > +TEST_GEN_FILES += thp_swap_allocator_test
> >> >> >> >  TEST_GEN_FILES += thuge-gen
> >> >> >> >  TEST_GEN_FILES += transhuge-stress
> >> >> >> >  TEST_GEN_FILES += uffd-stress
> >> >> >> > diff --git a/tools/testing/selftests/mm/thp_swap_allocator_test.c b/tools/testing/selftests/mm/thp_swap_allocator_test.c
> >> >> >> > new file mode 100644
> >> >> >> > index 000000000000..4443a906d0f8
> >> >> >> > --- /dev/null
> >> >> >> > +++ b/tools/testing/selftests/mm/thp_swap_allocator_test.c
> >> >> >> > @@ -0,0 +1,192 @@
> >> >> >> > +// SPDX-License-Identifier: GPL-2.0-or-later
> >> >> >> > +/*
> >> >> >> > + * thp_swap_allocator_test
> >> >> >> > + *
> >> >> >> > + * The purpose of this test program is helping check if THP swpout
> >> >> >> > + * can correctly get swap slots to swap out as a whole instead of
> >> >> >> > + * being split. It randomly releases swap entries through madvise
> >> >> >> > + * DONTNEED and do swapout on two memory areas: a memory area for
> >> >> >> > + * 64KB THP and the other area for small folios. The second memory
> >> >> >> > + * can be enabled by "-s".
> >> >> >> > + * Before running the program, we need to setup a zRAM or similar
> >> >> >> > + * swap device by:
> >> >> >> > + *  echo lzo > /sys/block/zram0/comp_algorithm
> >> >> >> > + *  echo 64M > /sys/block/zram0/disksize
> >> >> >> > + *  echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
> >> >> >> > + *  echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
> >> >> >> > + *  mkswap /dev/zram0
> >> >> >> > + *  swapon /dev/zram0
> >> >> >> > + * The expected result should be 0% anon swpout fallback ratio w/ or
> >> >> >> > + * w/o "-s".
> >> >> >> > + *
> >> >> >> > + * Author(s): Barry Song <v-songbaohua@oppo.com>
> >> >> >> > + */
> >> >> >> > +
> >> >> >> > +#define _GNU_SOURCE
> >> >> >> > +#include <stdio.h>
> >> >> >> > +#include <stdlib.h>
> >> >> >> > +#include <unistd.h>
> >> >> >> > +#include <string.h>
> >> >> >> > +#include <sys/mman.h>
> >> >> >> > +#include <errno.h>
> >> >> >> > +#include <time.h>
> >> >> >> > +
> >> >> >> > +#define MEMSIZE_MTHP (60 * 1024 * 1024)
> >> >> >> > +#define MEMSIZE_SMALLFOLIO (1 * 1024 * 1024)
> >> >> >> > +#define ALIGNMENT_MTHP (64 * 1024)
> >> >> >> > +#define ALIGNMENT_SMALLFOLIO (4 * 1024)
> >> >> >> > +#define TOTAL_DONTNEED_MTHP (16 * 1024 * 1024)
> >> >> >> > +#define TOTAL_DONTNEED_SMALLFOLIO (768 * 1024)
> >> >> >> > +#define MTHP_FOLIO_SIZE (64 * 1024)
> >> >> >> > +
> >> >> >> > +#define SWPOUT_PATH \
> >> >> >> > +     "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout"
> >> >> >> > +#define SWPOUT_FALLBACK_PATH \
> >> >> >> > +     "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback"
> >> >> >> > +
> >> >> >> > +static void *aligned_alloc_mem(size_t size, size_t alignment)
> >> >> >> > +{
> >> >> >> > +     void *mem = NULL;
> >> >> >> > +
> >> >> >> > +     if (posix_memalign(&mem, alignment, size) != 0) {
> >> >> >> > +             perror("posix_memalign");
> >> >> >> > +             return NULL;
> >> >> >> > +     }
> >> >> >> > +     return mem;
> >> >> >> > +}
> >> >> >> > +
> >> >> >> > +static void random_madvise_dontneed(void *mem, size_t mem_size,
> >> >> >> > +             size_t align_size, size_t total_dontneed_size)
> >> >> >> > +{
> >> >> >> > +     size_t num_pages = total_dontneed_size / align_size;
> >> >> >> > +     size_t i;
> >> >> >> > +     size_t offset;
> >> >> >> > +     void *addr;
> >> >> >> > +
> >> >> >> > +     for (i = 0; i < num_pages; ++i) {
> >> >> >> > +             offset = (rand() % (mem_size / align_size)) * align_size;
> >> >> >> > +             addr = (char *)mem + offset;
> >> >> >> > +             if (madvise(addr, align_size, MADV_DONTNEED) != 0)
> >> >> >> > +                     perror("madvise dontneed");
> >> >> >>
> >> >> >> IIUC, this simulates align_size (generally 64KB) swap-in.  That is, it
> >> >> >> simulate the effect of large size swap-in when it's not available in
> >> >> >> kernel.  If we have large size swap-in in kernel in the future, this
> >> >> >> becomes unnecessary.
> >> >> >>
> >> >> >> Additionally, we have not reached the consensus that we should always
> >> >> >> swap-in with swapped-out size.  So, I suspect that this test may not
> >> >> >> reflect real situation in the future.  Although it doesn't reflect
> >> >> >> current situation too.
> >> >> >
> >> >> > Disagree again. releasing the whole mTHP swaps is the best case. Even in
> >> >> > the best-case scenario, if we fail, it raises concerns for handling potentially
> >> >> > more challenging situations.
> >> >>
> >> >> Repeating sequential anonymous pages writing is the best case.
> >> >
> >> > I define the best case as the scenario with the least chance of creating
> >> > fragments within swapfiles for mTHP to swap out. There is no real
> >> > difference whether this is done through swapin or madv_dontneed.
> >>
> >> IMO, swapin is much more important than madv_dontneed.  Because most
> >> users use swapin automatically, but few use madv_dontneed by hand.  So,
> >> I think swapin/swapout test is much more important than madv_dontneed.
> >> I don't like this test case because madv_dontneed isn't typical or
> >> basic.
> >
> > Disliking DONTNEED isn't a sufficient reason to reject this test program because
> > no single small program can report swapout counters, swapout fallback counters,
> > and fallback ratios within several minutes for 100 iterations. That's
> > precisely why
> > we need it, at least initially. We can enhance it further if it lacks
> > certain functionalities
> > that people desire.
> >
> > The entire purpose of MADV_DONTNEED is to simulate a scenario where all
> > slots are released as a whole, preventing the creation of fragments, which is
> > most favorable for swap allocation. I believe there is no difference between
> > using MADV_DONTNEED or swapin for this purpose. But I am perfectly fine
> > with switching to swapin to replace MADV_DONTNEED in v2.
>
> Great!  Thanks for doing this!
>
> And even better, can we not make swap-in address aligned and size
> aligned?  It's too unrealistic.  It's good to consider some level of
> spatial locality, for example, swap-in random number of pages
> sequentially at some random addresses.  That could be a good general
> test program.  We can use it to evaluate further swap optimizations, for
> example, to evaluate the memory wastage of some swap-in size policy.

I wholeheartedly agree with everything mentioned above; these are
actually part of my plan as incremental patches. This initial commit
serves as the first step of the three I proposed in the last email.

>
> And, we don't need PAGEOUT too, just use large virtual address space in
> test programs.  We can trigger swapout in more common way.

I'm not particularly enthusiastic about this idea, as I expect the test program
to run quickly. A large virtual address space would result in long waiting times
for the test results, as it relies on vmscan. Therefore, I hope we can use real
workloads to achieve this instead.

>
> [snip]
>
> --
> Best Regards,
> Huang, Ying

Thanks
Barry
Huang, Ying June 20, 2024, 8:26 a.m. UTC | #11
Barry Song <21cnbao@gmail.com> writes:

> On Thu, Jun 20, 2024 at 8:01 PM Huang, Ying <ying.huang@intel.com> wrote:
>>
>> Barry Song <21cnbao@gmail.com> writes:
>>
>> > On Thu, Jun 20, 2024 at 6:36 PM Huang, Ying <ying.huang@intel.com> wrote:
>> >>
>> >> Barry Song <21cnbao@gmail.com> writes:
>> >>
>> >> > On Thu, Jun 20, 2024 at 5:22 PM Huang, Ying <ying.huang@intel.com> wrote:
>> >> >>
>> >> >> Barry Song <21cnbao@gmail.com> writes:
>> >> >>
>> >> >> > On Thu, Jun 20, 2024 at 1:55 PM Huang, Ying <ying.huang@intel.com> wrote:
>> >> >> >>
>> >> >> >> Barry Song <21cnbao@gmail.com> writes:
>> >> >> >>
>> >> >> >> > From: Barry Song <v-songbaohua@oppo.com>
>> >> >> >> >
>> >> >> >> > Both Ryan and Chris have been utilizing the small test program to aid
>> >> >> >> > in debugging and identifying issues with swap entry allocation. While
>> >> >> >> > a real or intricate workload might be more suitable for assessing the
>> >> >> >> > correctness and effectiveness of the swap allocation policy, a small
>> >> >> >> > test program presents a simpler means of understanding the problem and
>> >> >> >> > initially verifying the improvements being made.
>> >> >> >> >
>> >> >> >> > Let's endeavor to integrate it into the self-test suite. Although it
>> >> >> >> > presently only accommodates 64KB and 4KB, I'm optimistic that we can
>> >> >> >> > expand its capabilities to support multiple sizes and simulate more
>> >> >> >> > complex systems in the future as required.
>> >> >> >>
>> >> >> >> IIUC, this is a performance test program instead of functionality test
>> >> >> >> program.  Does it match the purpose of the kernel selftest?
>> >> >> >
>> >> >> > I have a differing perspective. I maintain that the functionality is
>> >> >> > not functioning
>> >> >> > as expected. Despite having all the necessary resources for allocation, failure
>> >> >> > persists, indicating a lack of functionality.
>> >> >>
>> >> >> Is there any user visual functionality issue?
>> >> >
>> >> > Definitely not. If a plane can't take off, taking a train and pretending
>> >> > there's no functionality issue isn't a solution.
>> >>
>> >> I always think that performance optimization is great work.  However, it
>> >> is not functionality work.
>> >>
>> >> > I have never assigned blame for any mistakes here. On the contrary,
>> >> > I have 100% appreciation for Ryan's work in at least initiating mTHP
>> >> > swapout w/o being split.
>> >> >
>> >> > It took countless experiments for humans to make airplanes commercially
>> >> > viable, but the person who created the first flying airplane remains the
>> >> > greatest. Similarly, Ryan's efforts, combined with your review of his patch,
>> >> > have enabled us to achieve a better goal here. Without your work, we can't
>> >> > get here at all.
>> >>
>> >> Thanks!
>> >>
>> >> > However, this is never a reason to refuse to acknowledge that this feature
>> >> > is not actually working.
>> >>
>> >> It just works for some workloads, not for some others.
>> >>
>> >> >>
>> >> >> >>
>> >> >> >> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>> >> >> >> > ---
>> >> >> >> >  tools/testing/selftests/mm/Makefile           |   1 +
>> >> >> >> >  .../selftests/mm/thp_swap_allocator_test.c    | 192 ++++++++++++++++++
>> >> >> >> >  2 files changed, 193 insertions(+)
>> >> >> >> >  create mode 100644 tools/testing/selftests/mm/thp_swap_allocator_test.c
>> >> >> >> >
>> >> >> >> > diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
>> >> >> >> > index e1aa09ddaa3d..64164ad66835 100644
>> >> >> >> > --- a/tools/testing/selftests/mm/Makefile
>> >> >> >> > +++ b/tools/testing/selftests/mm/Makefile
>> >> >> >> > @@ -65,6 +65,7 @@ TEST_GEN_FILES += mseal_test
>> >> >> >> >  TEST_GEN_FILES += seal_elf
>> >> >> >> >  TEST_GEN_FILES += on-fault-limit
>> >> >> >> >  TEST_GEN_FILES += pagemap_ioctl
>> >> >> >> > +TEST_GEN_FILES += thp_swap_allocator_test
>> >> >> >> >  TEST_GEN_FILES += thuge-gen
>> >> >> >> >  TEST_GEN_FILES += transhuge-stress
>> >> >> >> >  TEST_GEN_FILES += uffd-stress
>> >> >> >> > diff --git a/tools/testing/selftests/mm/thp_swap_allocator_test.c b/tools/testing/selftests/mm/thp_swap_allocator_test.c
>> >> >> >> > new file mode 100644
>> >> >> >> > index 000000000000..4443a906d0f8
>> >> >> >> > --- /dev/null
>> >> >> >> > +++ b/tools/testing/selftests/mm/thp_swap_allocator_test.c
>> >> >> >> > @@ -0,0 +1,192 @@
>> >> >> >> > +// SPDX-License-Identifier: GPL-2.0-or-later
>> >> >> >> > +/*
>> >> >> >> > + * thp_swap_allocator_test
>> >> >> >> > + *
>> >> >> >> > + * The purpose of this test program is helping check if THP swpout
>> >> >> >> > + * can correctly get swap slots to swap out as a whole instead of
>> >> >> >> > + * being split. It randomly releases swap entries through madvise
>> >> >> >> > + * DONTNEED and do swapout on two memory areas: a memory area for
>> >> >> >> > + * 64KB THP and the other area for small folios. The second memory
>> >> >> >> > + * can be enabled by "-s".
>> >> >> >> > + * Before running the program, we need to setup a zRAM or similar
>> >> >> >> > + * swap device by:
>> >> >> >> > + *  echo lzo > /sys/block/zram0/comp_algorithm
>> >> >> >> > + *  echo 64M > /sys/block/zram0/disksize
>> >> >> >> > + *  echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
>> >> >> >> > + *  echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
>> >> >> >> > + *  mkswap /dev/zram0
>> >> >> >> > + *  swapon /dev/zram0
>> >> >> >> > + * The expected result should be 0% anon swpout fallback ratio w/ or
>> >> >> >> > + * w/o "-s".
>> >> >> >> > + *
>> >> >> >> > + * Author(s): Barry Song <v-songbaohua@oppo.com>
>> >> >> >> > + */
>> >> >> >> > +
>> >> >> >> > +#define _GNU_SOURCE
>> >> >> >> > +#include <stdio.h>
>> >> >> >> > +#include <stdlib.h>
>> >> >> >> > +#include <unistd.h>
>> >> >> >> > +#include <string.h>
>> >> >> >> > +#include <sys/mman.h>
>> >> >> >> > +#include <errno.h>
>> >> >> >> > +#include <time.h>
>> >> >> >> > +
>> >> >> >> > +#define MEMSIZE_MTHP (60 * 1024 * 1024)
>> >> >> >> > +#define MEMSIZE_SMALLFOLIO (1 * 1024 * 1024)
>> >> >> >> > +#define ALIGNMENT_MTHP (64 * 1024)
>> >> >> >> > +#define ALIGNMENT_SMALLFOLIO (4 * 1024)
>> >> >> >> > +#define TOTAL_DONTNEED_MTHP (16 * 1024 * 1024)
>> >> >> >> > +#define TOTAL_DONTNEED_SMALLFOLIO (768 * 1024)
>> >> >> >> > +#define MTHP_FOLIO_SIZE (64 * 1024)
>> >> >> >> > +
>> >> >> >> > +#define SWPOUT_PATH \
>> >> >> >> > +     "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout"
>> >> >> >> > +#define SWPOUT_FALLBACK_PATH \
>> >> >> >> > +     "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback"
>> >> >> >> > +
>> >> >> >> > +static void *aligned_alloc_mem(size_t size, size_t alignment)
>> >> >> >> > +{
>> >> >> >> > +     void *mem = NULL;
>> >> >> >> > +
>> >> >> >> > +     if (posix_memalign(&mem, alignment, size) != 0) {
>> >> >> >> > +             perror("posix_memalign");
>> >> >> >> > +             return NULL;
>> >> >> >> > +     }
>> >> >> >> > +     return mem;
>> >> >> >> > +}
>> >> >> >> > +
>> >> >> >> > +static void random_madvise_dontneed(void *mem, size_t mem_size,
>> >> >> >> > +             size_t align_size, size_t total_dontneed_size)
>> >> >> >> > +{
>> >> >> >> > +     size_t num_pages = total_dontneed_size / align_size;
>> >> >> >> > +     size_t i;
>> >> >> >> > +     size_t offset;
>> >> >> >> > +     void *addr;
>> >> >> >> > +
>> >> >> >> > +     for (i = 0; i < num_pages; ++i) {
>> >> >> >> > +             offset = (rand() % (mem_size / align_size)) * align_size;
>> >> >> >> > +             addr = (char *)mem + offset;
>> >> >> >> > +             if (madvise(addr, align_size, MADV_DONTNEED) != 0)
>> >> >> >> > +                     perror("madvise dontneed");
>> >> >> >>
>> >> >> >> IIUC, this simulates align_size (generally 64KB) swap-in.  That is, it
>> >> >> >> simulate the effect of large size swap-in when it's not available in
>> >> >> >> kernel.  If we have large size swap-in in kernel in the future, this
>> >> >> >> becomes unnecessary.
>> >> >> >>
>> >> >> >> Additionally, we have not reached the consensus that we should always
>> >> >> >> swap-in with swapped-out size.  So, I suspect that this test may not
>> >> >> >> reflect real situation in the future.  Although it doesn't reflect
>> >> >> >> current situation too.
>> >> >> >
>> >> >> > Disagree again. releasing the whole mTHP swaps is the best case. Even in
>> >> >> > the best-case scenario, if we fail, it raises concerns for handling potentially
>> >> >> > more challenging situations.
>> >> >>
>> >> >> Repeating sequential anonymous pages writing is the best case.
>> >> >
>> >> > I define the best case as the scenario with the least chance of creating
>> >> > fragments within swapfiles for mTHP to swap out. There is no real
>> >> > difference whether this is done through swapin or madv_dontneed.
>> >>
>> >> IMO, swapin is much more important than madv_dontneed.  Because most
>> >> users use swapin automatically, but few use madv_dontneed by hand.  So,
>> >> I think swapin/swapout test is much more important than madv_dontneed.
>> >> I don't like this test case because madv_dontneed isn't typical or
>> >> basic.
>> >
>> > Disliking DONTNEED isn't a sufficient reason to reject this test program because
>> > no single small program can report swapout counters, swapout fallback counters,
>> > and fallback ratios within several minutes for 100 iterations. That's
>> > precisely why
>> > we need it, at least initially. We can enhance it further if it lacks
>> > certain functionalities
>> > that people desire.
>> >
>> > The entire purpose of MADV_DONTNEED is to simulate a scenario where all
>> > slots are released as a whole, preventing the creation of fragments, which is
>> > most favorable for swap allocation. I believe there is no difference between
>> > using MADV_DONTNEED or swapin for this purpose. But I am perfectly fine
>> > with switching to swapin to replace MADV_DONTNEED in v2.
>>
>> Great!  Thanks for doing this!
>>
>> And even better, can we not make swap-in address aligned and size
>> aligned?  It's too unrealistic.  It's good to consider some level of
>> spatial locality, for example, swap-in random number of pages
>> sequentially at some random addresses.  That could be a good general
>> test program.  We can use it to evaluate further swap optimizations, for
>> example, to evaluate the memory wastage of some swap-in size policy.
>
> I wholeheartedly agree with everything mentioned above; these are
> actually part of my plan as incremental patches. This initial commit
> serves as the first step of the three I proposed in the last email.

It will be a small test program to implement all these.  Don't need to
use 3 steps.  IMHO, it's not good to optimize for a unrealistic test
case with address aligned and size aligned swap-in.  It's trivial to
remove the alignment requirements.

>> And, we don't need PAGEOUT too, just use large virtual address space in
>> test programs.  We can trigger swapout in more common way.
>
> I'm not particularly enthusiastic about this idea, as I expect the test program
> to run quickly. A large virtual address space would result in long waiting times
> for the test results, as it relies on vmscan. Therefore, I hope we can use real
> workloads to achieve this instead.

I have use test program with large virtual address space (in
vm-scalability) to do swap test before.  It runs really fast.  Please
give it a try.

--
Best Regards,
Huang, Ying
Ryan Roberts June 20, 2024, 9:04 a.m. UTC | #12
On 20/06/2024 01:26, Barry Song wrote:
> From: Barry Song <v-songbaohua@oppo.com>
> 
> Both Ryan and Chris have been utilizing the small test program to aid
> in debugging and identifying issues with swap entry allocation. While
> a real or intricate workload might be more suitable for assessing the
> correctness and effectiveness of the swap allocation policy, a small
> test program presents a simpler means of understanding the problem and
> initially verifying the improvements being made.
> 
> Let's endeavor to integrate it into the self-test suite. Although it
> presently only accommodates 64KB and 4KB, I'm optimistic that we can
> expand its capabilities to support multiple sizes and simulate more
> complex systems in the future as required.

I'll try to summarize the thread with Huang Ying by suggesting this test program
is "neccessary but not sufficient" to exhaustively test the mTHP swap-out path.
I've certainly found it useful and think it would be a valuable addition to the
tree.

That said, I'm not convinced it is a selftest; IMO a selftest should provide a
clear pass/fail result against some criteria and must be able to be run
automatically by (e.g.) a CI system.

For the former, currently the test only fails if some API call unexpectedly
fails. Perhaps we would need to decide on some threshold for acceptable fallback
rate and fail if be go above that? But it's not clear how to determine such an
appropriate threshold.

For the latter, the test should be added to run_vmtests.sh. Then it will be run
automatically as part of invocing the standard kselftest runner and the results
will be reported in the standard TAP format. But this test also requires a very
specific swap and mthp configuration so you would also need to automatically
configure and restore that (either in run_vmtests.sh or in the c file).

Given the probable difficulties with defining clear success criteria, perhaps it
is better to put it in tools/mm?

Thanks,
Ryan


> 
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
>  tools/testing/selftests/mm/Makefile           |   1 +
>  .../selftests/mm/thp_swap_allocator_test.c    | 192 ++++++++++++++++++
>  2 files changed, 193 insertions(+)
>  create mode 100644 tools/testing/selftests/mm/thp_swap_allocator_test.c
> 
> diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
> index e1aa09ddaa3d..64164ad66835 100644
> --- a/tools/testing/selftests/mm/Makefile
> +++ b/tools/testing/selftests/mm/Makefile
> @@ -65,6 +65,7 @@ TEST_GEN_FILES += mseal_test
>  TEST_GEN_FILES += seal_elf
>  TEST_GEN_FILES += on-fault-limit
>  TEST_GEN_FILES += pagemap_ioctl
> +TEST_GEN_FILES += thp_swap_allocator_test
>  TEST_GEN_FILES += thuge-gen
>  TEST_GEN_FILES += transhuge-stress
>  TEST_GEN_FILES += uffd-stress
> diff --git a/tools/testing/selftests/mm/thp_swap_allocator_test.c b/tools/testing/selftests/mm/thp_swap_allocator_test.c
> new file mode 100644
> index 000000000000..4443a906d0f8
> --- /dev/null
> +++ b/tools/testing/selftests/mm/thp_swap_allocator_test.c
> @@ -0,0 +1,192 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * thp_swap_allocator_test
> + *
> + * The purpose of this test program is helping check if THP swpout
> + * can correctly get swap slots to swap out as a whole instead of
> + * being split. It randomly releases swap entries through madvise
> + * DONTNEED and do swapout on two memory areas: a memory area for
> + * 64KB THP and the other area for small folios. The second memory
> + * can be enabled by "-s".
> + * Before running the program, we need to setup a zRAM or similar
> + * swap device by:
> + *  echo lzo > /sys/block/zram0/comp_algorithm
> + *  echo 64M > /sys/block/zram0/disksize
> + *  echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
> + *  echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
> + *  mkswap /dev/zram0
> + *  swapon /dev/zram0
> + * The expected result should be 0% anon swpout fallback ratio w/ or
> + * w/o "-s".
> + *
> + * Author(s): Barry Song <v-songbaohua@oppo.com>
> + */
> +
> +#define _GNU_SOURCE
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <string.h>
> +#include <sys/mman.h>
> +#include <errno.h>
> +#include <time.h>
> +
> +#define MEMSIZE_MTHP (60 * 1024 * 1024)
> +#define MEMSIZE_SMALLFOLIO (1 * 1024 * 1024)
> +#define ALIGNMENT_MTHP (64 * 1024)
> +#define ALIGNMENT_SMALLFOLIO (4 * 1024)
> +#define TOTAL_DONTNEED_MTHP (16 * 1024 * 1024)
> +#define TOTAL_DONTNEED_SMALLFOLIO (768 * 1024)
> +#define MTHP_FOLIO_SIZE (64 * 1024)
> +
> +#define SWPOUT_PATH \
> +	"/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout"
> +#define SWPOUT_FALLBACK_PATH \
> +	"/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback"
> +
> +static void *aligned_alloc_mem(size_t size, size_t alignment)
> +{
> +	void *mem = NULL;
> +
> +	if (posix_memalign(&mem, alignment, size) != 0) {
> +		perror("posix_memalign");
> +		return NULL;
> +	}
> +	return mem;
> +}
> +
> +static void random_madvise_dontneed(void *mem, size_t mem_size,
> +		size_t align_size, size_t total_dontneed_size)
> +{
> +	size_t num_pages = total_dontneed_size / align_size;
> +	size_t i;
> +	size_t offset;
> +	void *addr;
> +
> +	for (i = 0; i < num_pages; ++i) {
> +		offset = (rand() % (mem_size / align_size)) * align_size;
> +		addr = (char *)mem + offset;
> +		if (madvise(addr, align_size, MADV_DONTNEED) != 0)
> +			perror("madvise dontneed");
> +
> +		memset(addr, 0x11, align_size);
> +	}
> +}
> +
> +static unsigned long read_stat(const char *path)
> +{
> +	FILE *file;
> +	unsigned long value;
> +
> +	file = fopen(path, "r");
> +	if (!file) {
> +		perror("fopen");
> +		return 0;
> +	}
> +
> +	if (fscanf(file, "%lu", &value) != 1) {
> +		perror("fscanf");
> +		fclose(file);
> +		return 0;
> +	}
> +
> +	fclose(file);
> +	return value;
> +}
> +
> +int main(int argc, char *argv[])
> +{
> +	int use_small_folio = 0;
> +	int i;
> +	void *mem1 = aligned_alloc_mem(MEMSIZE_MTHP, ALIGNMENT_MTHP);
> +	void *mem2 = NULL;
> +
> +	if (mem1 == NULL) {
> +		fprintf(stderr, "Failed to allocate 60MB memory\n");
> +		return EXIT_FAILURE;
> +	}
> +
> +	if (madvise(mem1, MEMSIZE_MTHP, MADV_HUGEPAGE) != 0) {
> +		perror("madvise hugepage for mem1");
> +		free(mem1);
> +		return EXIT_FAILURE;
> +	}
> +
> +	for (i = 1; i < argc; ++i) {
> +		if (strcmp(argv[i], "-s") == 0)
> +			use_small_folio = 1;
> +	}
> +
> +	if (use_small_folio) {
> +		mem2 = aligned_alloc_mem(MEMSIZE_SMALLFOLIO, ALIGNMENT_MTHP);
> +		if (mem2 == NULL) {
> +			fprintf(stderr, "Failed to allocate 1MB memory\n");
> +			free(mem1);
> +			return EXIT_FAILURE;
> +		}
> +
> +		if (madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_NOHUGEPAGE) != 0) {
> +			perror("madvise nohugepage for mem2");
> +			free(mem1);
> +			free(mem2);
> +			return EXIT_FAILURE;
> +		}
> +	}
> +
> +	for (i = 0; i < 100; ++i) {
> +		unsigned long initial_swpout;
> +		unsigned long initial_swpout_fallback;
> +		unsigned long final_swpout;
> +		unsigned long final_swpout_fallback;
> +		unsigned long swpout_inc;
> +		unsigned long swpout_fallback_inc;
> +		double fallback_percentage;
> +
> +		initial_swpout = read_stat(SWPOUT_PATH);
> +		initial_swpout_fallback = read_stat(SWPOUT_FALLBACK_PATH);
> +
> +		random_madvise_dontneed(mem1, MEMSIZE_MTHP, ALIGNMENT_MTHP,
> +				TOTAL_DONTNEED_MTHP);
> +
> +		if (use_small_folio) {
> +			random_madvise_dontneed(mem2, MEMSIZE_SMALLFOLIO,
> +					ALIGNMENT_SMALLFOLIO,
> +					TOTAL_DONTNEED_SMALLFOLIO);
> +		}
> +
> +		if (madvise(mem1, MEMSIZE_MTHP, MADV_PAGEOUT) != 0) {
> +			perror("madvise pageout for mem1");
> +			free(mem1);
> +			if (mem2 != NULL)
> +				free(mem2);
> +			return EXIT_FAILURE;
> +		}
> +
> +		if (use_small_folio) {
> +			if (madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_PAGEOUT) != 0) {
> +				perror("madvise pageout for mem2");
> +				free(mem1);
> +				free(mem2);
> +				return EXIT_FAILURE;
> +			}
> +		}
> +
> +		final_swpout = read_stat(SWPOUT_PATH);
> +		final_swpout_fallback = read_stat(SWPOUT_FALLBACK_PATH);
> +
> +		swpout_inc = final_swpout - initial_swpout;
> +		swpout_fallback_inc = final_swpout_fallback - initial_swpout_fallback;
> +
> +		fallback_percentage = (double)swpout_fallback_inc /
> +			(swpout_fallback_inc + swpout_inc) * 100;
> +
> +		printf("Iteration %d: swpout inc: %lu, swpout fallback inc: %lu, Fallback percentage: %.2f%%\n",
> +				i + 1, swpout_inc, swpout_fallback_inc, fallback_percentage);
> +	}
> +
> +	free(mem1);
> +	if (mem2 != NULL)
> +		free(mem2);
> +
> +	return EXIT_SUCCESS;
> +}
Barry Song June 20, 2024, 9:07 a.m. UTC | #13
On Thu, Jun 20, 2024 at 8:28 PM Huang, Ying <ying.huang@intel.com> wrote:
>
> Barry Song <21cnbao@gmail.com> writes:
>
> > On Thu, Jun 20, 2024 at 8:01 PM Huang, Ying <ying.huang@intel.com> wrote:
> >>
> >> Barry Song <21cnbao@gmail.com> writes:
> >>
> >> > On Thu, Jun 20, 2024 at 6:36 PM Huang, Ying <ying.huang@intel.com> wrote:
> >> >>
> >> >> Barry Song <21cnbao@gmail.com> writes:
> >> >>
> >> >> > On Thu, Jun 20, 2024 at 5:22 PM Huang, Ying <ying.huang@intel.com> wrote:
> >> >> >>
> >> >> >> Barry Song <21cnbao@gmail.com> writes:
> >> >> >>
> >> >> >> > On Thu, Jun 20, 2024 at 1:55 PM Huang, Ying <ying.huang@intel.com> wrote:
> >> >> >> >>
> >> >> >> >> Barry Song <21cnbao@gmail.com> writes:
> >> >> >> >>
> >> >> >> >> > From: Barry Song <v-songbaohua@oppo.com>
> >> >> >> >> >
> >> >> >> >> > Both Ryan and Chris have been utilizing the small test program to aid
> >> >> >> >> > in debugging and identifying issues with swap entry allocation. While
> >> >> >> >> > a real or intricate workload might be more suitable for assessing the
> >> >> >> >> > correctness and effectiveness of the swap allocation policy, a small
> >> >> >> >> > test program presents a simpler means of understanding the problem and
> >> >> >> >> > initially verifying the improvements being made.
> >> >> >> >> >
> >> >> >> >> > Let's endeavor to integrate it into the self-test suite. Although it
> >> >> >> >> > presently only accommodates 64KB and 4KB, I'm optimistic that we can
> >> >> >> >> > expand its capabilities to support multiple sizes and simulate more
> >> >> >> >> > complex systems in the future as required.
> >> >> >> >>
> >> >> >> >> IIUC, this is a performance test program instead of functionality test
> >> >> >> >> program.  Does it match the purpose of the kernel selftest?
> >> >> >> >
> >> >> >> > I have a differing perspective. I maintain that the functionality is
> >> >> >> > not functioning
> >> >> >> > as expected. Despite having all the necessary resources for allocation, failure
> >> >> >> > persists, indicating a lack of functionality.
> >> >> >>
> >> >> >> Is there any user visual functionality issue?
> >> >> >
> >> >> > Definitely not. If a plane can't take off, taking a train and pretending
> >> >> > there's no functionality issue isn't a solution.
> >> >>
> >> >> I always think that performance optimization is great work.  However, it
> >> >> is not functionality work.
> >> >>
> >> >> > I have never assigned blame for any mistakes here. On the contrary,
> >> >> > I have 100% appreciation for Ryan's work in at least initiating mTHP
> >> >> > swapout w/o being split.
> >> >> >
> >> >> > It took countless experiments for humans to make airplanes commercially
> >> >> > viable, but the person who created the first flying airplane remains the
> >> >> > greatest. Similarly, Ryan's efforts, combined with your review of his patch,
> >> >> > have enabled us to achieve a better goal here. Without your work, we can't
> >> >> > get here at all.
> >> >>
> >> >> Thanks!
> >> >>
> >> >> > However, this is never a reason to refuse to acknowledge that this feature
> >> >> > is not actually working.
> >> >>
> >> >> It just works for some workloads, not for some others.
> >> >>
> >> >> >>
> >> >> >> >>
> >> >> >> >> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> >> >> >> >> > ---
> >> >> >> >> >  tools/testing/selftests/mm/Makefile           |   1 +
> >> >> >> >> >  .../selftests/mm/thp_swap_allocator_test.c    | 192 ++++++++++++++++++
> >> >> >> >> >  2 files changed, 193 insertions(+)
> >> >> >> >> >  create mode 100644 tools/testing/selftests/mm/thp_swap_allocator_test.c
> >> >> >> >> >
> >> >> >> >> > diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
> >> >> >> >> > index e1aa09ddaa3d..64164ad66835 100644
> >> >> >> >> > --- a/tools/testing/selftests/mm/Makefile
> >> >> >> >> > +++ b/tools/testing/selftests/mm/Makefile
> >> >> >> >> > @@ -65,6 +65,7 @@ TEST_GEN_FILES += mseal_test
> >> >> >> >> >  TEST_GEN_FILES += seal_elf
> >> >> >> >> >  TEST_GEN_FILES += on-fault-limit
> >> >> >> >> >  TEST_GEN_FILES += pagemap_ioctl
> >> >> >> >> > +TEST_GEN_FILES += thp_swap_allocator_test
> >> >> >> >> >  TEST_GEN_FILES += thuge-gen
> >> >> >> >> >  TEST_GEN_FILES += transhuge-stress
> >> >> >> >> >  TEST_GEN_FILES += uffd-stress
> >> >> >> >> > diff --git a/tools/testing/selftests/mm/thp_swap_allocator_test.c b/tools/testing/selftests/mm/thp_swap_allocator_test.c
> >> >> >> >> > new file mode 100644
> >> >> >> >> > index 000000000000..4443a906d0f8
> >> >> >> >> > --- /dev/null
> >> >> >> >> > +++ b/tools/testing/selftests/mm/thp_swap_allocator_test.c
> >> >> >> >> > @@ -0,0 +1,192 @@
> >> >> >> >> > +// SPDX-License-Identifier: GPL-2.0-or-later
> >> >> >> >> > +/*
> >> >> >> >> > + * thp_swap_allocator_test
> >> >> >> >> > + *
> >> >> >> >> > + * The purpose of this test program is helping check if THP swpout
> >> >> >> >> > + * can correctly get swap slots to swap out as a whole instead of
> >> >> >> >> > + * being split. It randomly releases swap entries through madvise
> >> >> >> >> > + * DONTNEED and do swapout on two memory areas: a memory area for
> >> >> >> >> > + * 64KB THP and the other area for small folios. The second memory
> >> >> >> >> > + * can be enabled by "-s".
> >> >> >> >> > + * Before running the program, we need to setup a zRAM or similar
> >> >> >> >> > + * swap device by:
> >> >> >> >> > + *  echo lzo > /sys/block/zram0/comp_algorithm
> >> >> >> >> > + *  echo 64M > /sys/block/zram0/disksize
> >> >> >> >> > + *  echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
> >> >> >> >> > + *  echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
> >> >> >> >> > + *  mkswap /dev/zram0
> >> >> >> >> > + *  swapon /dev/zram0
> >> >> >> >> > + * The expected result should be 0% anon swpout fallback ratio w/ or
> >> >> >> >> > + * w/o "-s".
> >> >> >> >> > + *
> >> >> >> >> > + * Author(s): Barry Song <v-songbaohua@oppo.com>
> >> >> >> >> > + */
> >> >> >> >> > +
> >> >> >> >> > +#define _GNU_SOURCE
> >> >> >> >> > +#include <stdio.h>
> >> >> >> >> > +#include <stdlib.h>
> >> >> >> >> > +#include <unistd.h>
> >> >> >> >> > +#include <string.h>
> >> >> >> >> > +#include <sys/mman.h>
> >> >> >> >> > +#include <errno.h>
> >> >> >> >> > +#include <time.h>
> >> >> >> >> > +
> >> >> >> >> > +#define MEMSIZE_MTHP (60 * 1024 * 1024)
> >> >> >> >> > +#define MEMSIZE_SMALLFOLIO (1 * 1024 * 1024)
> >> >> >> >> > +#define ALIGNMENT_MTHP (64 * 1024)
> >> >> >> >> > +#define ALIGNMENT_SMALLFOLIO (4 * 1024)
> >> >> >> >> > +#define TOTAL_DONTNEED_MTHP (16 * 1024 * 1024)
> >> >> >> >> > +#define TOTAL_DONTNEED_SMALLFOLIO (768 * 1024)
> >> >> >> >> > +#define MTHP_FOLIO_SIZE (64 * 1024)
> >> >> >> >> > +
> >> >> >> >> > +#define SWPOUT_PATH \
> >> >> >> >> > +     "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout"
> >> >> >> >> > +#define SWPOUT_FALLBACK_PATH \
> >> >> >> >> > +     "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback"
> >> >> >> >> > +
> >> >> >> >> > +static void *aligned_alloc_mem(size_t size, size_t alignment)
> >> >> >> >> > +{
> >> >> >> >> > +     void *mem = NULL;
> >> >> >> >> > +
> >> >> >> >> > +     if (posix_memalign(&mem, alignment, size) != 0) {
> >> >> >> >> > +             perror("posix_memalign");
> >> >> >> >> > +             return NULL;
> >> >> >> >> > +     }
> >> >> >> >> > +     return mem;
> >> >> >> >> > +}
> >> >> >> >> > +
> >> >> >> >> > +static void random_madvise_dontneed(void *mem, size_t mem_size,
> >> >> >> >> > +             size_t align_size, size_t total_dontneed_size)
> >> >> >> >> > +{
> >> >> >> >> > +     size_t num_pages = total_dontneed_size / align_size;
> >> >> >> >> > +     size_t i;
> >> >> >> >> > +     size_t offset;
> >> >> >> >> > +     void *addr;
> >> >> >> >> > +
> >> >> >> >> > +     for (i = 0; i < num_pages; ++i) {
> >> >> >> >> > +             offset = (rand() % (mem_size / align_size)) * align_size;
> >> >> >> >> > +             addr = (char *)mem + offset;
> >> >> >> >> > +             if (madvise(addr, align_size, MADV_DONTNEED) != 0)
> >> >> >> >> > +                     perror("madvise dontneed");
> >> >> >> >>
> >> >> >> >> IIUC, this simulates align_size (generally 64KB) swap-in.  That is, it
> >> >> >> >> simulate the effect of large size swap-in when it's not available in
> >> >> >> >> kernel.  If we have large size swap-in in kernel in the future, this
> >> >> >> >> becomes unnecessary.
> >> >> >> >>
> >> >> >> >> Additionally, we have not reached the consensus that we should always
> >> >> >> >> swap-in with swapped-out size.  So, I suspect that this test may not
> >> >> >> >> reflect real situation in the future.  Although it doesn't reflect
> >> >> >> >> current situation too.
> >> >> >> >
> >> >> >> > Disagree again. releasing the whole mTHP swaps is the best case. Even in
> >> >> >> > the best-case scenario, if we fail, it raises concerns for handling potentially
> >> >> >> > more challenging situations.
> >> >> >>
> >> >> >> Repeating sequential anonymous pages writing is the best case.
> >> >> >
> >> >> > I define the best case as the scenario with the least chance of creating
> >> >> > fragments within swapfiles for mTHP to swap out. There is no real
> >> >> > difference whether this is done through swapin or madv_dontneed.
> >> >>
> >> >> IMO, swapin is much more important than madv_dontneed.  Because most
> >> >> users use swapin automatically, but few use madv_dontneed by hand.  So,
> >> >> I think swapin/swapout test is much more important than madv_dontneed.
> >> >> I don't like this test case because madv_dontneed isn't typical or
> >> >> basic.
> >> >
> >> > Disliking DONTNEED isn't a sufficient reason to reject this test program because
> >> > no single small program can report swapout counters, swapout fallback counters,
> >> > and fallback ratios within several minutes for 100 iterations. That's
> >> > precisely why
> >> > we need it, at least initially. We can enhance it further if it lacks
> >> > certain functionalities
> >> > that people desire.
> >> >
> >> > The entire purpose of MADV_DONTNEED is to simulate a scenario where all
> >> > slots are released as a whole, preventing the creation of fragments, which is
> >> > most favorable for swap allocation. I believe there is no difference between
> >> > using MADV_DONTNEED or swapin for this purpose. But I am perfectly fine
> >> > with switching to swapin to replace MADV_DONTNEED in v2.
> >>
> >> Great!  Thanks for doing this!
> >>
> >> And even better, can we not make swap-in address aligned and size
> >> aligned?  It's too unrealistic.  It's good to consider some level of
> >> spatial locality, for example, swap-in random number of pages
> >> sequentially at some random addresses.  That could be a good general
> >> test program.  We can use it to evaluate further swap optimizations, for
> >> example, to evaluate the memory wastage of some swap-in size policy.
> >
> > I wholeheartedly agree with everything mentioned above; these are
> > actually part of my plan as incremental patches. This initial commit
> > serves as the first step of the three I proposed in the last email.
>
> It will be a small test program to implement all these.  Don't need to
> use 3 steps.  IMHO, it's not good to optimize for a unrealistic test
> case with address aligned and size aligned swap-in.  It's trivial to
> remove the alignment requirements.

I'll preserve alignment as an option for Chris and Ryan to compare
aligned cases with unaligned ones, rather than removing the alignment
code altogether. Additionally, it can aid us in understanding the potential
outcomes when dealing with large folio swap-ins.

>
> >> And, we don't need PAGEOUT too, just use large virtual address space in
> >> test programs.  We can trigger swapout in more common way.
> >
> > I'm not particularly enthusiastic about this idea, as I expect the test program
> > to run quickly. A large virtual address space would result in long waiting times
> > for the test results, as it relies on vmscan. Therefore, I hope we can use real
> > workloads to achieve this instead.
>
> I have use test program with large virtual address space (in
> vm-scalability) to do swap test before.  It runs really fast.  Please
> give it a try.

I'm not so optimistic. Having worked on both server and embedded systems,
I understand the disparity in resources between rich server environments and
resource-constrained embedded systems.
I'm constantly frustrated by how slow my tests run and deploy.

>
> --
> Best Regards,
> Huang, Ying

Thanks
Barry
David Hildenbrand June 20, 2024, 11:34 a.m. UTC | #14
On 20.06.24 11:04, Ryan Roberts wrote:
> On 20/06/2024 01:26, Barry Song wrote:
>> From: Barry Song <v-songbaohua@oppo.com>
>>
>> Both Ryan and Chris have been utilizing the small test program to aid
>> in debugging and identifying issues with swap entry allocation. While
>> a real or intricate workload might be more suitable for assessing the
>> correctness and effectiveness of the swap allocation policy, a small
>> test program presents a simpler means of understanding the problem and
>> initially verifying the improvements being made.
>>
>> Let's endeavor to integrate it into the self-test suite. Although it
>> presently only accommodates 64KB and 4KB, I'm optimistic that we can
>> expand its capabilities to support multiple sizes and simulate more
>> complex systems in the future as required.
> 
> I'll try to summarize the thread with Huang Ying by suggesting this test program
> is "neccessary but not sufficient" to exhaustively test the mTHP swap-out path.
> I've certainly found it useful and think it would be a valuable addition to the
> tree.
> 
> That said, I'm not convinced it is a selftest; IMO a selftest should provide a
> clear pass/fail result against some criteria and must be able to be run
> automatically by (e.g.) a CI system.

Likely we should then consider moving other such performance-related 
thingies out of the selftests?
Chris Li June 20, 2024, 11:34 p.m. UTC | #15
Hi Barry,

Thanks for the wonderful test program.

I have also used other swap test programs as well. A lot of those
tests are harder to setup up and run.

This test is very quick and simple to run. It can test some hard to
hit corner cases for me.

I am able to reproduce the warning and the kernel oops with this test program.
So for me, I am using it as a functional test that my allocator did
not produce a crash.
In that regard, it definitely provides value as a function test.

Having a fall percentage output is fine, as long as we don't fail the
test based on performance number.

I am also fine with moving the test to under tools/mm etc. I see good
value to include the test in the tree one way or the other.


On Wed, Jun 19, 2024 at 5:27 PM Barry Song <21cnbao@gmail.com> wrote:
>
> From: Barry Song <v-songbaohua@oppo.com>
>
> Both Ryan and Chris have been utilizing the small test program to aid
> in debugging and identifying issues with swap entry allocation. While
> a real or intricate workload might be more suitable for assessing the
> correctness and effectiveness of the swap allocation policy, a small
> test program presents a simpler means of understanding the problem and
> initially verifying the improvements being made.
>
> Let's endeavor to integrate it into the self-test suite. Although it
> presently only accommodates 64KB and 4KB, I'm optimistic that we can
> expand its capabilities to support multiple sizes and simulate more
> complex systems in the future as required.
>
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
>  tools/testing/selftests/mm/Makefile           |   1 +
>  .../selftests/mm/thp_swap_allocator_test.c    | 192 ++++++++++++++++++
>  2 files changed, 193 insertions(+)

Assume we want to keep it as selftest.
You did not add your test in run_vmtests.sh.

You might need something like this:

--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -418,6 +418,14 @@ CATEGORY="thp" run_test ./khugepaged -s 2

 CATEGORY="thp" run_test ./transhuge-stress -d 20

+# config and swapon zram here.
+
+CATEGORY="thp" run_test ./thp_swap_allocator_test
+
+CATEGORY="thp" run_test ./thp_swap_allocator_test -s
+
+# swapoff zram here.
+
 # Try to create XFS if not provided
 if [ -z "${SPLIT_HUGE_PAGE_TEST_XFS_PATH}" ]; then
     if test_selected "thp"; then


You can use the following XFS test as an example of how to setup the zram swap.
XFS uses file system mount, you use swapon.

Also you need to update the usage string in run_vmtests.sh.

BTW, here is how I invoke the test runs:

kselftest_override_timeout=500 make -C tools/testing/selftests
TARGETS=mm run_tests

The time out is not for this test, it is for some other test before
the thp_swap which exit run_vmtests.sh before hitting thp_swap. I am
running in a VM so it is slower than native machine.

>  create mode 100644 tools/testing/selftests/mm/thp_swap_allocator_test.c
>
> diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
> index e1aa09ddaa3d..64164ad66835 100644
> --- a/tools/testing/selftests/mm/Makefile
> +++ b/tools/testing/selftests/mm/Makefile
> @@ -65,6 +65,7 @@ TEST_GEN_FILES += mseal_test
>  TEST_GEN_FILES += seal_elf
>  TEST_GEN_FILES += on-fault-limit
>  TEST_GEN_FILES += pagemap_ioctl
> +TEST_GEN_FILES += thp_swap_allocator_test
>  TEST_GEN_FILES += thuge-gen
>  TEST_GEN_FILES += transhuge-stress
>  TEST_GEN_FILES += uffd-stress
> diff --git a/tools/testing/selftests/mm/thp_swap_allocator_test.c b/tools/testing/selftests/mm/thp_swap_allocator_test.c
> new file mode 100644
> index 000000000000..4443a906d0f8
> --- /dev/null
> +++ b/tools/testing/selftests/mm/thp_swap_allocator_test.c
> @@ -0,0 +1,192 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * thp_swap_allocator_test
> + *
> + * The purpose of this test program is helping check if THP swpout
> + * can correctly get swap slots to swap out as a whole instead of
> + * being split. It randomly releases swap entries through madvise
> + * DONTNEED and do swapout on two memory areas: a memory area for
> + * 64KB THP and the other area for small folios. The second memory
> + * can be enabled by "-s".
> + * Before running the program, we need to setup a zRAM or similar
> + * swap device by:
> + *  echo lzo > /sys/block/zram0/comp_algorithm
> + *  echo 64M > /sys/block/zram0/disksize
> + *  echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
> + *  echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
> + *  mkswap /dev/zram0
> + *  swapon /dev/zram0

This setup needs to go into run_vmtest.sh as well.

Also tear it down after the test.

Chris

> + * The expected result should be 0% anon swpout fallback ratio w/ or
> + * w/o "-s".
> + *
> + * Author(s): Barry Song <v-songbaohua@oppo.com>
> + */
> +
> +#define _GNU_SOURCE
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <string.h>
> +#include <sys/mman.h>
> +#include <errno.h>
> +#include <time.h>
> +
> +#define MEMSIZE_MTHP (60 * 1024 * 1024)
> +#define MEMSIZE_SMALLFOLIO (1 * 1024 * 1024)
> +#define ALIGNMENT_MTHP (64 * 1024)
> +#define ALIGNMENT_SMALLFOLIO (4 * 1024)
> +#define TOTAL_DONTNEED_MTHP (16 * 1024 * 1024)
> +#define TOTAL_DONTNEED_SMALLFOLIO (768 * 1024)
> +#define MTHP_FOLIO_SIZE (64 * 1024)
> +
> +#define SWPOUT_PATH \
> +       "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout"
> +#define SWPOUT_FALLBACK_PATH \
> +       "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback"
> +
> +static void *aligned_alloc_mem(size_t size, size_t alignment)
> +{
> +       void *mem = NULL;
> +
> +       if (posix_memalign(&mem, alignment, size) != 0) {
> +               perror("posix_memalign");
> +               return NULL;
> +       }
> +       return mem;
> +}
> +
> +static void random_madvise_dontneed(void *mem, size_t mem_size,
> +               size_t align_size, size_t total_dontneed_size)
> +{
> +       size_t num_pages = total_dontneed_size / align_size;
> +       size_t i;
> +       size_t offset;
> +       void *addr;
> +
> +       for (i = 0; i < num_pages; ++i) {
> +               offset = (rand() % (mem_size / align_size)) * align_size;
> +               addr = (char *)mem + offset;
> +               if (madvise(addr, align_size, MADV_DONTNEED) != 0)
> +                       perror("madvise dontneed");
> +
> +               memset(addr, 0x11, align_size);
> +       }
> +}
> +
> +static unsigned long read_stat(const char *path)
> +{
> +       FILE *file;
> +       unsigned long value;
> +
> +       file = fopen(path, "r");
> +       if (!file) {
> +               perror("fopen");
> +               return 0;
> +       }
> +
> +       if (fscanf(file, "%lu", &value) != 1) {
> +               perror("fscanf");
> +               fclose(file);
> +               return 0;
> +       }
> +
> +       fclose(file);
> +       return value;
> +}
> +
> +int main(int argc, char *argv[])
> +{
> +       int use_small_folio = 0;
> +       int i;
> +       void *mem1 = aligned_alloc_mem(MEMSIZE_MTHP, ALIGNMENT_MTHP);
> +       void *mem2 = NULL;
> +
> +       if (mem1 == NULL) {
> +               fprintf(stderr, "Failed to allocate 60MB memory\n");
> +               return EXIT_FAILURE;
> +       }
> +
> +       if (madvise(mem1, MEMSIZE_MTHP, MADV_HUGEPAGE) != 0) {
> +               perror("madvise hugepage for mem1");
> +               free(mem1);
> +               return EXIT_FAILURE;
> +       }
> +
> +       for (i = 1; i < argc; ++i) {
> +               if (strcmp(argv[i], "-s") == 0)
> +                       use_small_folio = 1;
> +       }
> +
> +       if (use_small_folio) {
> +               mem2 = aligned_alloc_mem(MEMSIZE_SMALLFOLIO, ALIGNMENT_MTHP);
> +               if (mem2 == NULL) {
> +                       fprintf(stderr, "Failed to allocate 1MB memory\n");
> +                       free(mem1);
> +                       return EXIT_FAILURE;
> +               }
> +
> +               if (madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_NOHUGEPAGE) != 0) {
> +                       perror("madvise nohugepage for mem2");
> +                       free(mem1);
> +                       free(mem2);
> +                       return EXIT_FAILURE;
> +               }
> +       }
> +
> +       for (i = 0; i < 100; ++i) {
> +               unsigned long initial_swpout;
> +               unsigned long initial_swpout_fallback;
> +               unsigned long final_swpout;
> +               unsigned long final_swpout_fallback;
> +               unsigned long swpout_inc;
> +               unsigned long swpout_fallback_inc;
> +               double fallback_percentage;
> +
> +               initial_swpout = read_stat(SWPOUT_PATH);
> +               initial_swpout_fallback = read_stat(SWPOUT_FALLBACK_PATH);
> +
> +               random_madvise_dontneed(mem1, MEMSIZE_MTHP, ALIGNMENT_MTHP,
> +                               TOTAL_DONTNEED_MTHP);
> +
> +               if (use_small_folio) {
> +                       random_madvise_dontneed(mem2, MEMSIZE_SMALLFOLIO,
> +                                       ALIGNMENT_SMALLFOLIO,
> +                                       TOTAL_DONTNEED_SMALLFOLIO);
> +               }
> +
> +               if (madvise(mem1, MEMSIZE_MTHP, MADV_PAGEOUT) != 0) {
> +                       perror("madvise pageout for mem1");
> +                       free(mem1);
> +                       if (mem2 != NULL)
> +                               free(mem2);
> +                       return EXIT_FAILURE;
> +               }
> +
> +               if (use_small_folio) {
> +                       if (madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_PAGEOUT) != 0) {
> +                               perror("madvise pageout for mem2");
> +                               free(mem1);
> +                               free(mem2);
> +                               return EXIT_FAILURE;
> +                       }
> +               }
> +
> +               final_swpout = read_stat(SWPOUT_PATH);
> +               final_swpout_fallback = read_stat(SWPOUT_FALLBACK_PATH);
> +
> +               swpout_inc = final_swpout - initial_swpout;
> +               swpout_fallback_inc = final_swpout_fallback - initial_swpout_fallback;
> +
> +               fallback_percentage = (double)swpout_fallback_inc /
> +                       (swpout_fallback_inc + swpout_inc) * 100;
> +
> +               printf("Iteration %d: swpout inc: %lu, swpout fallback inc: %lu, Fallback percentage: %.2f%%\n",
> +                               i + 1, swpout_inc, swpout_fallback_inc, fallback_percentage);


Chris

> +       }
> +
> +       free(mem1);
> +       if (mem2 != NULL)
> +               free(mem2);
> +
> +       return EXIT_SUCCESS;
> +}
> --
> 2.34.1
>
>
Huang, Ying June 21, 2024, 2:33 a.m. UTC | #16
David Hildenbrand <david@redhat.com> writes:

> On 20.06.24 11:04, Ryan Roberts wrote:
>> On 20/06/2024 01:26, Barry Song wrote:
>>> From: Barry Song <v-songbaohua@oppo.com>
>>>
>>> Both Ryan and Chris have been utilizing the small test program to aid
>>> in debugging and identifying issues with swap entry allocation. While
>>> a real or intricate workload might be more suitable for assessing the
>>> correctness and effectiveness of the swap allocation policy, a small
>>> test program presents a simpler means of understanding the problem and
>>> initially verifying the improvements being made.
>>>
>>> Let's endeavor to integrate it into the self-test suite. Although it
>>> presently only accommodates 64KB and 4KB, I'm optimistic that we can
>>> expand its capabilities to support multiple sizes and simulate more
>>> complex systems in the future as required.
>> I'll try to summarize the thread with Huang Ying by suggesting this
>> test program
>> is "neccessary but not sufficient" to exhaustively test the mTHP swap-out path.
>> I've certainly found it useful and think it would be a valuable addition to the
>> tree.
>> That said, I'm not convinced it is a selftest; IMO a selftest should
>> provide a
>> clear pass/fail result against some criteria and must be able to be run
>> automatically by (e.g.) a CI system.
>
> Likely we should then consider moving other such performance-related
> thingies out of the selftests?

I think that it's good to distinguish between functionality and
performance tests.  For example, 0-day test system will use virtual
machines to do some functionality tests to improve efficiency.  But it's
not good to run performance tests in such kind of virtual machines.

--
Best Regards,
Huang, Ying
Ryan Roberts June 21, 2024, 7:25 a.m. UTC | #17
On 20/06/2024 12:34, David Hildenbrand wrote:
> On 20.06.24 11:04, Ryan Roberts wrote:
>> On 20/06/2024 01:26, Barry Song wrote:
>>> From: Barry Song <v-songbaohua@oppo.com>
>>>
>>> Both Ryan and Chris have been utilizing the small test program to aid
>>> in debugging and identifying issues with swap entry allocation. While
>>> a real or intricate workload might be more suitable for assessing the
>>> correctness and effectiveness of the swap allocation policy, a small
>>> test program presents a simpler means of understanding the problem and
>>> initially verifying the improvements being made.
>>>
>>> Let's endeavor to integrate it into the self-test suite. Although it
>>> presently only accommodates 64KB and 4KB, I'm optimistic that we can
>>> expand its capabilities to support multiple sizes and simulate more
>>> complex systems in the future as required.
>>
>> I'll try to summarize the thread with Huang Ying by suggesting this test program
>> is "neccessary but not sufficient" to exhaustively test the mTHP swap-out path.
>> I've certainly found it useful and think it would be a valuable addition to the
>> tree.
>>
>> That said, I'm not convinced it is a selftest; IMO a selftest should provide a
>> clear pass/fail result against some criteria and must be able to be run
>> automatically by (e.g.) a CI system.
> 
> Likely we should then consider moving other such performance-related thingies
> out of the selftests?

Yes, that would get my vote. But of the 4 tests you mentioned that use
clock_gettime(), it looks like transhuge-stress is the only one that doesn't
have a pass/fail result, so is probably the only candidate for moving.

The others either use the times as a timeout and determines failure if the
action didn't occur within the timeout (e.g. ksm_tests.c) or use it to add some
supplemental performance information to an otherwise functionality-oriented test.
Ryan Roberts June 21, 2024, 7:34 a.m. UTC | #18
On 21/06/2024 00:34, Chris Li wrote:

>> + * thp_swap_allocator_test
>> + *
>> + * The purpose of this test program is helping check if THP swpout
>> + * can correctly get swap slots to swap out as a whole instead of
>> + * being split. It randomly releases swap entries through madvise
>> + * DONTNEED and do swapout on two memory areas: a memory area for
>> + * 64KB THP and the other area for small folios. The second memory
>> + * can be enabled by "-s".
>> + * Before running the program, we need to setup a zRAM or similar
>> + * swap device by:
>> + *  echo lzo > /sys/block/zram0/comp_algorithm
>> + *  echo 64M > /sys/block/zram0/disksize
>> + *  echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
>> + *  echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
>> + *  mkswap /dev/zram0
>> + *  swapon /dev/zram0
> 
> This setup needs to go into run_vmtest.sh as well.
> 
> Also tear it down after the test.

Additionally, if keeping this as a selftest, you'll want to add

CONFIG_ZRAM=y

to tools/testing/selftests/mm/config so that automated systems ensure zram is
available in the kernel under test.

And you will need to ensure that the zram device has a higher priority than any
other already configured swap devices. Otherwise there will not even be an
attempt to use it for mTHP. The easy way is to do "swapoff -a" as the first step
but that makes cleanup tricky.
Barry Song June 21, 2024, 7:47 a.m. UTC | #19
On Fri, Jun 21, 2024 at 7:25 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 20/06/2024 12:34, David Hildenbrand wrote:
> > On 20.06.24 11:04, Ryan Roberts wrote:
> >> On 20/06/2024 01:26, Barry Song wrote:
> >>> From: Barry Song <v-songbaohua@oppo.com>
> >>>
> >>> Both Ryan and Chris have been utilizing the small test program to aid
> >>> in debugging and identifying issues with swap entry allocation. While
> >>> a real or intricate workload might be more suitable for assessing the
> >>> correctness and effectiveness of the swap allocation policy, a small
> >>> test program presents a simpler means of understanding the problem and
> >>> initially verifying the improvements being made.
> >>>
> >>> Let's endeavor to integrate it into the self-test suite. Although it
> >>> presently only accommodates 64KB and 4KB, I'm optimistic that we can
> >>> expand its capabilities to support multiple sizes and simulate more
> >>> complex systems in the future as required.
> >>
> >> I'll try to summarize the thread with Huang Ying by suggesting this test program
> >> is "neccessary but not sufficient" to exhaustively test the mTHP swap-out path.
> >> I've certainly found it useful and think it would be a valuable addition to the
> >> tree.
> >>
> >> That said, I'm not convinced it is a selftest; IMO a selftest should provide a
> >> clear pass/fail result against some criteria and must be able to be run
> >> automatically by (e.g.) a CI system.
> >
> > Likely we should then consider moving other such performance-related thingies
> > out of the selftests?
>
> Yes, that would get my vote. But of the 4 tests you mentioned that use
> clock_gettime(), it looks like transhuge-stress is the only one that doesn't
> have a pass/fail result, so is probably the only candidate for moving.
>
> The others either use the times as a timeout and determines failure if the
> action didn't occur within the timeout (e.g. ksm_tests.c) or use it to add some
> supplemental performance information to an otherwise functionality-oriented test.

Thank you very much, Ryan. I think you've found a better home for this
tool . I will
send v2, relocating it to tools/mm and adding a function to swap in
either the whole
mTHPs or a portion of mTHPs by "-a"(aligned swapin).

So basically, we will have

1. Use MADV_PAGEPUT for rapid swap-out, putting the swap allocation code under
high exercise in a short time.

2. Use MADV_DONTNEED to simulate the behavior of libc and Java heap in freeing
memory, as well as for munmap, app exits, or OOM killer scenarios. This ensures
new mTHP is always generated, released or swapped out, similar to the behavior
on a PC or Android phone where many applications are frequently started and
terminated.

3. Swap in with or without the "-a" option to observe how fragments
due to swap-in
and the incoming swap-in of large folios will impact swap-out fallback.

And many thanks to Chris for the suggestion on improving it within
selftest, though I
prefer to place it in tools/mm.

Thanks
Barry
Ryan Roberts June 21, 2024, 7:58 a.m. UTC | #20
On 21/06/2024 08:47, Barry Song wrote:
> On Fri, Jun 21, 2024 at 7:25 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> On 20/06/2024 12:34, David Hildenbrand wrote:
>>> On 20.06.24 11:04, Ryan Roberts wrote:
>>>> On 20/06/2024 01:26, Barry Song wrote:
>>>>> From: Barry Song <v-songbaohua@oppo.com>
>>>>>
>>>>> Both Ryan and Chris have been utilizing the small test program to aid
>>>>> in debugging and identifying issues with swap entry allocation. While
>>>>> a real or intricate workload might be more suitable for assessing the
>>>>> correctness and effectiveness of the swap allocation policy, a small
>>>>> test program presents a simpler means of understanding the problem and
>>>>> initially verifying the improvements being made.
>>>>>
>>>>> Let's endeavor to integrate it into the self-test suite. Although it
>>>>> presently only accommodates 64KB and 4KB, I'm optimistic that we can
>>>>> expand its capabilities to support multiple sizes and simulate more
>>>>> complex systems in the future as required.
>>>>
>>>> I'll try to summarize the thread with Huang Ying by suggesting this test program
>>>> is "neccessary but not sufficient" to exhaustively test the mTHP swap-out path.
>>>> I've certainly found it useful and think it would be a valuable addition to the
>>>> tree.
>>>>
>>>> That said, I'm not convinced it is a selftest; IMO a selftest should provide a
>>>> clear pass/fail result against some criteria and must be able to be run
>>>> automatically by (e.g.) a CI system.
>>>
>>> Likely we should then consider moving other such performance-related thingies
>>> out of the selftests?
>>
>> Yes, that would get my vote. But of the 4 tests you mentioned that use
>> clock_gettime(), it looks like transhuge-stress is the only one that doesn't
>> have a pass/fail result, so is probably the only candidate for moving.
>>
>> The others either use the times as a timeout and determines failure if the
>> action didn't occur within the timeout (e.g. ksm_tests.c) or use it to add some
>> supplemental performance information to an otherwise functionality-oriented test.
> 
> Thank you very much, Ryan. I think you've found a better home for this
> tool . I will
> send v2, relocating it to tools/mm and adding a function to swap in
> either the whole
> mTHPs or a portion of mTHPs by "-a"(aligned swapin).
> 
> So basically, we will have
> 
> 1. Use MADV_PAGEPUT for rapid swap-out, putting the swap allocation code under
> high exercise in a short time.
> 
> 2. Use MADV_DONTNEED to simulate the behavior of libc and Java heap in freeing
> memory, as well as for munmap, app exits, or OOM killer scenarios. This ensures
> new mTHP is always generated, released or swapped out, similar to the behavior
> on a PC or Android phone where many applications are frequently started and
> terminated.
> 
> 3. Swap in with or without the "-a" option to observe how fragments
> due to swap-in
> and the incoming swap-in of large folios will impact swap-out fallback.
> 
> And many thanks to Chris for the suggestion on improving it within
> selftest, though I
> prefer to place it in tools/mm.

All sounds good to me!

If, (for future) you also wanted to test the vmscan swap-out path, the way I've
been doing that is to run the workload in a memory-constrained cgroup. That
means you don't need to exhaust all your phsical ram so speeds things up a lot.


> 
> Thanks
> Barry
Chris Li June 21, 2024, 8:50 a.m. UTC | #21
On Fri, Jun 21, 2024 at 12:47 AM Barry Song <21cnbao@gmail.com> wrote:
>
> On Fri, Jun 21, 2024 at 7:25 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >
> > On 20/06/2024 12:34, David Hildenbrand wrote:
> > > On 20.06.24 11:04, Ryan Roberts wrote:
> > >> On 20/06/2024 01:26, Barry Song wrote:
> > >>> From: Barry Song <v-songbaohua@oppo.com>
> > >>>
> > >>> Both Ryan and Chris have been utilizing the small test program to aid
> > >>> in debugging and identifying issues with swap entry allocation. While
> > >>> a real or intricate workload might be more suitable for assessing the
> > >>> correctness and effectiveness of the swap allocation policy, a small
> > >>> test program presents a simpler means of understanding the problem and
> > >>> initially verifying the improvements being made.
> > >>>
> > >>> Let's endeavor to integrate it into the self-test suite. Although it
> > >>> presently only accommodates 64KB and 4KB, I'm optimistic that we can
> > >>> expand its capabilities to support multiple sizes and simulate more
> > >>> complex systems in the future as required.
> > >>
> > >> I'll try to summarize the thread with Huang Ying by suggesting this test program
> > >> is "neccessary but not sufficient" to exhaustively test the mTHP swap-out path.
> > >> I've certainly found it useful and think it would be a valuable addition to the
> > >> tree.
> > >>
> > >> That said, I'm not convinced it is a selftest; IMO a selftest should provide a
> > >> clear pass/fail result against some criteria and must be able to be run
> > >> automatically by (e.g.) a CI system.
> > >
> > > Likely we should then consider moving other such performance-related thingies
> > > out of the selftests?
> >
> > Yes, that would get my vote. But of the 4 tests you mentioned that use
> > clock_gettime(), it looks like transhuge-stress is the only one that doesn't
> > have a pass/fail result, so is probably the only candidate for moving.
> >
> > The others either use the times as a timeout and determines failure if the
> > action didn't occur within the timeout (e.g. ksm_tests.c) or use it to add some
> > supplemental performance information to an otherwise functionality-oriented test.
>
> Thank you very much, Ryan. I think you've found a better home for this
> tool . I will
> send v2, relocating it to tools/mm and adding a function to swap in
> either the whole
> mTHPs or a portion of mTHPs by "-a"(aligned swapin).
>
> So basically, we will have
>
> 1. Use MADV_PAGEPUT for rapid swap-out, putting the swap allocation code under
> high exercise in a short time.
>
> 2. Use MADV_DONTNEED to simulate the behavior of libc and Java heap in freeing
> memory, as well as for munmap, app exits, or OOM killer scenarios. This ensures
> new mTHP is always generated, released or swapped out, similar to the behavior
> on a PC or Android phone where many applications are frequently started and
> terminated.

Will this cover the case that the ratio of order 0 and order 4 swap
requests change during LMK, and swapfile is almost full?

If not, please add that :-)

> 3. Swap in with or without the "-a" option to observe how fragments
> due to swap-in
> and the incoming swap-in of large folios will impact swap-out fallback.
>
> And many thanks to Chris for the suggestion on improving it within
> selftest, though I
> prefer to place it in tools/mm.

I am perfectly fine with that. Looking forward to your V2.

Chris
David Hildenbrand June 21, 2024, 8:52 a.m. UTC | #22
On 21.06.24 09:25, Ryan Roberts wrote:
> On 20/06/2024 12:34, David Hildenbrand wrote:
>> On 20.06.24 11:04, Ryan Roberts wrote:
>>> On 20/06/2024 01:26, Barry Song wrote:
>>>> From: Barry Song <v-songbaohua@oppo.com>
>>>>
>>>> Both Ryan and Chris have been utilizing the small test program to aid
>>>> in debugging and identifying issues with swap entry allocation. While
>>>> a real or intricate workload might be more suitable for assessing the
>>>> correctness and effectiveness of the swap allocation policy, a small
>>>> test program presents a simpler means of understanding the problem and
>>>> initially verifying the improvements being made.
>>>>
>>>> Let's endeavor to integrate it into the self-test suite. Although it
>>>> presently only accommodates 64KB and 4KB, I'm optimistic that we can
>>>> expand its capabilities to support multiple sizes and simulate more
>>>> complex systems in the future as required.
>>>
>>> I'll try to summarize the thread with Huang Ying by suggesting this test program
>>> is "neccessary but not sufficient" to exhaustively test the mTHP swap-out path.
>>> I've certainly found it useful and think it would be a valuable addition to the
>>> tree.
>>>
>>> That said, I'm not convinced it is a selftest; IMO a selftest should provide a
>>> clear pass/fail result against some criteria and must be able to be run
>>> automatically by (e.g.) a CI system.
>>
>> Likely we should then consider moving other such performance-related thingies
>> out of the selftests?
> 
> Yes, that would get my vote. But of the 4 tests you mentioned that use
> clock_gettime(), it looks like transhuge-stress is the only one that doesn't
> have a pass/fail result, so is probably the only candidate for moving.
> 
> The others either use the times as a timeout and determines failure if the
> action didn't occur within the timeout (e.g. ksm_tests.c) or use it to add some
> supplemental performance information to an otherwise functionality-oriented test.

Likely for ksm it would make sense to move the really functional parts 
to ksm_function_tests.c.

Fur gup_test it might be similar.
Huang, Ying June 21, 2024, 9:22 a.m. UTC | #23
Barry Song <21cnbao@gmail.com> writes:

> On Fri, Jun 21, 2024 at 7:25 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> On 20/06/2024 12:34, David Hildenbrand wrote:
>> > On 20.06.24 11:04, Ryan Roberts wrote:
>> >> On 20/06/2024 01:26, Barry Song wrote:
>> >>> From: Barry Song <v-songbaohua@oppo.com>
>> >>>
>> >>> Both Ryan and Chris have been utilizing the small test program to aid
>> >>> in debugging and identifying issues with swap entry allocation. While
>> >>> a real or intricate workload might be more suitable for assessing the
>> >>> correctness and effectiveness of the swap allocation policy, a small
>> >>> test program presents a simpler means of understanding the problem and
>> >>> initially verifying the improvements being made.
>> >>>
>> >>> Let's endeavor to integrate it into the self-test suite. Although it
>> >>> presently only accommodates 64KB and 4KB, I'm optimistic that we can
>> >>> expand its capabilities to support multiple sizes and simulate more
>> >>> complex systems in the future as required.
>> >>
>> >> I'll try to summarize the thread with Huang Ying by suggesting this test program
>> >> is "neccessary but not sufficient" to exhaustively test the mTHP swap-out path.
>> >> I've certainly found it useful and think it would be a valuable addition to the
>> >> tree.
>> >>
>> >> That said, I'm not convinced it is a selftest; IMO a selftest should provide a
>> >> clear pass/fail result against some criteria and must be able to be run
>> >> automatically by (e.g.) a CI system.
>> >
>> > Likely we should then consider moving other such performance-related thingies
>> > out of the selftests?
>>
>> Yes, that would get my vote. But of the 4 tests you mentioned that use
>> clock_gettime(), it looks like transhuge-stress is the only one that doesn't
>> have a pass/fail result, so is probably the only candidate for moving.
>>
>> The others either use the times as a timeout and determines failure if the
>> action didn't occur within the timeout (e.g. ksm_tests.c) or use it to add some
>> supplemental performance information to an otherwise functionality-oriented test.
>
> Thank you very much, Ryan. I think you've found a better home for this
> tool . I will
> send v2, relocating it to tools/mm and adding a function to swap in
> either the whole
> mTHPs or a portion of mTHPs by "-a"(aligned swapin).
>
> So basically, we will have
>
> 1. Use MADV_PAGEPUT for rapid swap-out, putting the swap allocation code under
> high exercise in a short time.
>
> 2. Use MADV_DONTNEED to simulate the behavior of libc and Java heap in freeing
> memory, as well as for munmap, app exits, or OOM killer scenarios. This ensures
> new mTHP is always generated, released or swapped out, similar to the behavior
> on a PC or Android phone where many applications are frequently started and
> terminated.

MADV_DONTNEED 64KB memory, then memset() it, this just simulates the
large folio swap-in exactly, which hasn't been merged by upstream.  I
don't think that it's a good idea to make such kind of trick.

> 3. Swap in with or without the "-a" option to observe how fragments
> due to swap-in
> and the incoming swap-in of large folios will impact swap-out fallback.

It's good to create fragmentation with swap-in.  Which is more practical
and future-proof.  And, I believe that we can reduce large folio
swap-out fallback rate without the large folio swap-in trick.

> And many thanks to Chris for the suggestion on improving it within
> selftest, though I
> prefer to place it in tools/mm.

--
Best Regards,
Huang, Ying
Barry Song June 21, 2024, 9:43 a.m. UTC | #24
On Fri, Jun 21, 2024 at 9:24 PM Huang, Ying <ying.huang@intel.com> wrote:
>
> Barry Song <21cnbao@gmail.com> writes:
>
> > On Fri, Jun 21, 2024 at 7:25 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>
> >> On 20/06/2024 12:34, David Hildenbrand wrote:
> >> > On 20.06.24 11:04, Ryan Roberts wrote:
> >> >> On 20/06/2024 01:26, Barry Song wrote:
> >> >>> From: Barry Song <v-songbaohua@oppo.com>
> >> >>>
> >> >>> Both Ryan and Chris have been utilizing the small test program to aid
> >> >>> in debugging and identifying issues with swap entry allocation. While
> >> >>> a real or intricate workload might be more suitable for assessing the
> >> >>> correctness and effectiveness of the swap allocation policy, a small
> >> >>> test program presents a simpler means of understanding the problem and
> >> >>> initially verifying the improvements being made.
> >> >>>
> >> >>> Let's endeavor to integrate it into the self-test suite. Although it
> >> >>> presently only accommodates 64KB and 4KB, I'm optimistic that we can
> >> >>> expand its capabilities to support multiple sizes and simulate more
> >> >>> complex systems in the future as required.
> >> >>
> >> >> I'll try to summarize the thread with Huang Ying by suggesting this test program
> >> >> is "neccessary but not sufficient" to exhaustively test the mTHP swap-out path.
> >> >> I've certainly found it useful and think it would be a valuable addition to the
> >> >> tree.
> >> >>
> >> >> That said, I'm not convinced it is a selftest; IMO a selftest should provide a
> >> >> clear pass/fail result against some criteria and must be able to be run
> >> >> automatically by (e.g.) a CI system.
> >> >
> >> > Likely we should then consider moving other such performance-related thingies
> >> > out of the selftests?
> >>
> >> Yes, that would get my vote. But of the 4 tests you mentioned that use
> >> clock_gettime(), it looks like transhuge-stress is the only one that doesn't
> >> have a pass/fail result, so is probably the only candidate for moving.
> >>
> >> The others either use the times as a timeout and determines failure if the
> >> action didn't occur within the timeout (e.g. ksm_tests.c) or use it to add some
> >> supplemental performance information to an otherwise functionality-oriented test.
> >
> > Thank you very much, Ryan. I think you've found a better home for this
> > tool . I will
> > send v2, relocating it to tools/mm and adding a function to swap in
> > either the whole
> > mTHPs or a portion of mTHPs by "-a"(aligned swapin).
> >
> > So basically, we will have
> >
> > 1. Use MADV_PAGEPUT for rapid swap-out, putting the swap allocation code under
> > high exercise in a short time.
> >
> > 2. Use MADV_DONTNEED to simulate the behavior of libc and Java heap in freeing
> > memory, as well as for munmap, app exits, or OOM killer scenarios. This ensures
> > new mTHP is always generated, released or swapped out, similar to the behavior
> > on a PC or Android phone where many applications are frequently started and
> > terminated.
>
> MADV_DONTNEED 64KB memory, then memset() it, this just simulates the
> large folio swap-in exactly, which hasn't been merged by upstream.  I
> don't think that it's a good idea to make such kind of trick.

I disagree. This is how userspace heaps can manage memory deallocation.
Additionally, in the event of an application exit, munmap, or OOM killer, the
amount of freed memory can be much larger than 64KB. The primary purpose
of using MADV_DONTNEED is to release anonymous memory and generate
new mTHP so that the iteration can continue. Otherwise, the test program
becomes entirely pointless, as we only have large folios at the beginning.
That is exactly why Chris has failed to find his bugs by using other small
programs.

On the other hand, we definitely want large folios swap-in, otherwise, mTHP
is just a toy to Android or similar system where more than 2/3 memory could
be in swap. We do NOT want single-use mTHP.

>
> > 3. Swap in with or without the "-a" option to observe how fragments
> > due to swap-in
> > and the incoming swap-in of large folios will impact swap-out fallback.
>
> It's good to create fragmentation with swap-in.  Which is more practical
> and future-proof.  And, I believe that we can reduce large folio
> swap-out fallback rate without the large folio swap-in trick.
>
> > And many thanks to Chris for the suggestion on improving it within
> > selftest, though I
> > prefer to place it in tools/mm.
>
> --
> Best Regards,
> Huang, Ying

Thanks
Barry
Barry Song June 21, 2024, 11:20 a.m. UTC | #25
On Fri, Jun 21, 2024 at 4:50 PM Chris Li <chrisl@kernel.org> wrote:
>
> On Fri, Jun 21, 2024 at 12:47 AM Barry Song <21cnbao@gmail.com> wrote:
> >
> > On Fri, Jun 21, 2024 at 7:25 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
> > >
> > > On 20/06/2024 12:34, David Hildenbrand wrote:
> > > > On 20.06.24 11:04, Ryan Roberts wrote:
> > > >> On 20/06/2024 01:26, Barry Song wrote:
> > > >>> From: Barry Song <v-songbaohua@oppo.com>
> > > >>>
> > > >>> Both Ryan and Chris have been utilizing the small test program to aid
> > > >>> in debugging and identifying issues with swap entry allocation. While
> > > >>> a real or intricate workload might be more suitable for assessing the
> > > >>> correctness and effectiveness of the swap allocation policy, a small
> > > >>> test program presents a simpler means of understanding the problem and
> > > >>> initially verifying the improvements being made.
> > > >>>
> > > >>> Let's endeavor to integrate it into the self-test suite. Although it
> > > >>> presently only accommodates 64KB and 4KB, I'm optimistic that we can
> > > >>> expand its capabilities to support multiple sizes and simulate more
> > > >>> complex systems in the future as required.
> > > >>
> > > >> I'll try to summarize the thread with Huang Ying by suggesting this test program
> > > >> is "neccessary but not sufficient" to exhaustively test the mTHP swap-out path.
> > > >> I've certainly found it useful and think it would be a valuable addition to the
> > > >> tree.
> > > >>
> > > >> That said, I'm not convinced it is a selftest; IMO a selftest should provide a
> > > >> clear pass/fail result against some criteria and must be able to be run
> > > >> automatically by (e.g.) a CI system.
> > > >
> > > > Likely we should then consider moving other such performance-related thingies
> > > > out of the selftests?
> > >
> > > Yes, that would get my vote. But of the 4 tests you mentioned that use
> > > clock_gettime(), it looks like transhuge-stress is the only one that doesn't
> > > have a pass/fail result, so is probably the only candidate for moving.
> > >
> > > The others either use the times as a timeout and determines failure if the
> > > action didn't occur within the timeout (e.g. ksm_tests.c) or use it to add some
> > > supplemental performance information to an otherwise functionality-oriented test.
> >
> > Thank you very much, Ryan. I think you've found a better home for this
> > tool . I will
> > send v2, relocating it to tools/mm and adding a function to swap in
> > either the whole
> > mTHPs or a portion of mTHPs by "-a"(aligned swapin).
> >
> > So basically, we will have
> >
> > 1. Use MADV_PAGEPUT for rapid swap-out, putting the swap allocation code under
> > high exercise in a short time.
> >
> > 2. Use MADV_DONTNEED to simulate the behavior of libc and Java heap in freeing
> > memory, as well as for munmap, app exits, or OOM killer scenarios. This ensures
> > new mTHP is always generated, released or swapped out, similar to the behavior
> > on a PC or Android phone where many applications are frequently started and
> > terminated.
>
> Will this cover the case that the ratio of order 0 and order 4 swap
> requests change during LMK, and swapfile is almost full?
>
> If not, please add that :-)

Due to 2, we ensure a certain proportion of mTHP. Similarly, because
of 3, we maintain
a certain proportion of small folios, as we don't support large folios
swap-in, meaning
any swap-in will immediately result in small folios. Therefore, with
both 2 and 3, we
automatically achieve a system containing both mTHP and small folios.
Additionally,
1 provides the ability to continuously swap them out. If we set the
same sizes for 2
and 3, we'll achieve a 1:1 ratio of large folios to small folios. How
about starting with
a 1:1 ratio?

To meet the requirement that the swapfile is almost full, I can
increase the memory to
ensure the total size is quite close to zRAM. This way, we give the
small folios a chance
to perform a slow scan and observe the impact.

>
> > 3. Swap in with or without the "-a" option to observe how fragments
> > due to swap-in
> > and the incoming swap-in of large folios will impact swap-out fallback.
> >
> > And many thanks to Chris for the suggestion on improving it within
> > selftest, though I
> > prefer to place it in tools/mm.
>
> I am perfectly fine with that. Looking forward to your V2.
>
> Chris

Thanks
Barry
Huang, Ying June 24, 2024, 3:42 a.m. UTC | #26
Barry Song <21cnbao@gmail.com> writes:

> On Fri, Jun 21, 2024 at 9:24 PM Huang, Ying <ying.huang@intel.com> wrote:
>>
>> Barry Song <21cnbao@gmail.com> writes:
>>
>> > On Fri, Jun 21, 2024 at 7:25 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>> >>
>> >> On 20/06/2024 12:34, David Hildenbrand wrote:
>> >> > On 20.06.24 11:04, Ryan Roberts wrote:
>> >> >> On 20/06/2024 01:26, Barry Song wrote:
>> >> >>> From: Barry Song <v-songbaohua@oppo.com>
>> >> >>>
>> >> >>> Both Ryan and Chris have been utilizing the small test program to aid
>> >> >>> in debugging and identifying issues with swap entry allocation. While
>> >> >>> a real or intricate workload might be more suitable for assessing the
>> >> >>> correctness and effectiveness of the swap allocation policy, a small
>> >> >>> test program presents a simpler means of understanding the problem and
>> >> >>> initially verifying the improvements being made.
>> >> >>>
>> >> >>> Let's endeavor to integrate it into the self-test suite. Although it
>> >> >>> presently only accommodates 64KB and 4KB, I'm optimistic that we can
>> >> >>> expand its capabilities to support multiple sizes and simulate more
>> >> >>> complex systems in the future as required.
>> >> >>
>> >> >> I'll try to summarize the thread with Huang Ying by suggesting this test program
>> >> >> is "neccessary but not sufficient" to exhaustively test the mTHP swap-out path.
>> >> >> I've certainly found it useful and think it would be a valuable addition to the
>> >> >> tree.
>> >> >>
>> >> >> That said, I'm not convinced it is a selftest; IMO a selftest should provide a
>> >> >> clear pass/fail result against some criteria and must be able to be run
>> >> >> automatically by (e.g.) a CI system.
>> >> >
>> >> > Likely we should then consider moving other such performance-related thingies
>> >> > out of the selftests?
>> >>
>> >> Yes, that would get my vote. But of the 4 tests you mentioned that use
>> >> clock_gettime(), it looks like transhuge-stress is the only one that doesn't
>> >> have a pass/fail result, so is probably the only candidate for moving.
>> >>
>> >> The others either use the times as a timeout and determines failure if the
>> >> action didn't occur within the timeout (e.g. ksm_tests.c) or use it to add some
>> >> supplemental performance information to an otherwise functionality-oriented test.
>> >
>> > Thank you very much, Ryan. I think you've found a better home for this
>> > tool . I will
>> > send v2, relocating it to tools/mm and adding a function to swap in
>> > either the whole
>> > mTHPs or a portion of mTHPs by "-a"(aligned swapin).
>> >
>> > So basically, we will have
>> >
>> > 1. Use MADV_PAGEPUT for rapid swap-out, putting the swap allocation code under
>> > high exercise in a short time.
>> >
>> > 2. Use MADV_DONTNEED to simulate the behavior of libc and Java heap in freeing
>> > memory, as well as for munmap, app exits, or OOM killer scenarios. This ensures
>> > new mTHP is always generated, released or swapped out, similar to the behavior
>> > on a PC or Android phone where many applications are frequently started and
>> > terminated.
>>
>> MADV_DONTNEED 64KB memory, then memset() it, this just simulates the
>> large folio swap-in exactly, which hasn't been merged by upstream.  I
>> don't think that it's a good idea to make such kind of trick.
>
> I disagree. This is how userspace heaps can manage memory
> deallocation.

Sorry, I don't understand how.  Can you show some examples?  Such as
strace log with 64KB aligned MADV_DONTNEED?

> Additionally, in the event of an application exit, munmap, or OOM killer, the
> amount of freed memory can be much larger than 64KB. The primary purpose
> of using MADV_DONTNEED is to release anonymous memory and generate
> new mTHP so that the iteration can continue. Otherwise, the test program
> becomes entirely pointless, as we only have large folios at the beginning.
> That is exactly why Chris has failed to find his bugs by using other small
> programs.

Although I still don't understand how 64KB aligned MADV_DONTNEED is used
for libc/java heap or munmap in a practical way.  After more thoughts, I
think 64KB Aligned MADV_DONTNEED can simulate the fragmentation effect
of processes exit at some degree if 64KB folios in these processes are
swapped out without splitting.  If you have no other practical use
cases, I suggest to make it explicit with comments in program.

> On the other hand, we definitely want large folios swap-in, otherwise, mTHP
> is just a toy to Android or similar system where more than 2/3 memory could
> be in swap. We do NOT want single-use mTHP.

I agree that large folios swap-in has its value at least in some
situations.  Whether we should take it as default behavior is another
topic, we can discuss it further in the future.

>>
>> > 3. Swap in with or without the "-a" option to observe how fragments
>> > due to swap-in
>> > and the incoming swap-in of large folios will impact swap-out fallback.
>>
>> It's good to create fragmentation with swap-in.  Which is more practical
>> and future-proof.  And, I believe that we can reduce large folio
>> swap-out fallback rate without the large folio swap-in trick.
>>
>> > And many thanks to Chris for the suggestion on improving it within
>> > selftest, though I
>> > prefer to place it in tools/mm.

--
Best Regards,
Huang, Ying
Barry Song June 24, 2024, 4:05 a.m. UTC | #27
On Mon, Jun 24, 2024 at 3:44 PM Huang, Ying <ying.huang@intel.com> wrote:
>
> Barry Song <21cnbao@gmail.com> writes:
>
> > On Fri, Jun 21, 2024 at 9:24 PM Huang, Ying <ying.huang@intel.com> wrote:
> >>
> >> Barry Song <21cnbao@gmail.com> writes:
> >>
> >> > On Fri, Jun 21, 2024 at 7:25 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >> >>
> >> >> On 20/06/2024 12:34, David Hildenbrand wrote:
> >> >> > On 20.06.24 11:04, Ryan Roberts wrote:
> >> >> >> On 20/06/2024 01:26, Barry Song wrote:
> >> >> >>> From: Barry Song <v-songbaohua@oppo.com>
> >> >> >>>
> >> >> >>> Both Ryan and Chris have been utilizing the small test program to aid
> >> >> >>> in debugging and identifying issues with swap entry allocation. While
> >> >> >>> a real or intricate workload might be more suitable for assessing the
> >> >> >>> correctness and effectiveness of the swap allocation policy, a small
> >> >> >>> test program presents a simpler means of understanding the problem and
> >> >> >>> initially verifying the improvements being made.
> >> >> >>>
> >> >> >>> Let's endeavor to integrate it into the self-test suite. Although it
> >> >> >>> presently only accommodates 64KB and 4KB, I'm optimistic that we can
> >> >> >>> expand its capabilities to support multiple sizes and simulate more
> >> >> >>> complex systems in the future as required.
> >> >> >>
> >> >> >> I'll try to summarize the thread with Huang Ying by suggesting this test program
> >> >> >> is "neccessary but not sufficient" to exhaustively test the mTHP swap-out path.
> >> >> >> I've certainly found it useful and think it would be a valuable addition to the
> >> >> >> tree.
> >> >> >>
> >> >> >> That said, I'm not convinced it is a selftest; IMO a selftest should provide a
> >> >> >> clear pass/fail result against some criteria and must be able to be run
> >> >> >> automatically by (e.g.) a CI system.
> >> >> >
> >> >> > Likely we should then consider moving other such performance-related thingies
> >> >> > out of the selftests?
> >> >>
> >> >> Yes, that would get my vote. But of the 4 tests you mentioned that use
> >> >> clock_gettime(), it looks like transhuge-stress is the only one that doesn't
> >> >> have a pass/fail result, so is probably the only candidate for moving.
> >> >>
> >> >> The others either use the times as a timeout and determines failure if the
> >> >> action didn't occur within the timeout (e.g. ksm_tests.c) or use it to add some
> >> >> supplemental performance information to an otherwise functionality-oriented test.
> >> >
> >> > Thank you very much, Ryan. I think you've found a better home for this
> >> > tool . I will
> >> > send v2, relocating it to tools/mm and adding a function to swap in
> >> > either the whole
> >> > mTHPs or a portion of mTHPs by "-a"(aligned swapin).
> >> >
> >> > So basically, we will have
> >> >
> >> > 1. Use MADV_PAGEPUT for rapid swap-out, putting the swap allocation code under
> >> > high exercise in a short time.
> >> >
> >> > 2. Use MADV_DONTNEED to simulate the behavior of libc and Java heap in freeing
> >> > memory, as well as for munmap, app exits, or OOM killer scenarios. This ensures
> >> > new mTHP is always generated, released or swapped out, similar to the behavior
> >> > on a PC or Android phone where many applications are frequently started and
> >> > terminated.
> >>
> >> MADV_DONTNEED 64KB memory, then memset() it, this just simulates the
> >> large folio swap-in exactly, which hasn't been merged by upstream.  I
> >> don't think that it's a good idea to make such kind of trick.
> >
> > I disagree. This is how userspace heaps can manage memory
> > deallocation.
>
> Sorry, I don't understand how.  Can you show some examples?  Such as
> strace log with 64KB aligned MADV_DONTNEED?

In Java heap and memory allocators such as jemalloc and Scudo, memory is freed
using the MADV_DONTNEED flag when either free() is called or garbage collection
occurs. In Android, the Java heap is freed in chunks aligned to 64KB
or larger. In
Scudo and jemalloc, there is a configuration option to set the
management granularity.
This granularity is set to match the mTHP size(though the default
value is 16KB in the
latest Android if we don't run mTHP). Otherwise, you could end up with
millions of
partial unmap operations, which would severely degrade the performance of mTHP.

Imagine libc/Java functioning like a slab allocator. When kfree() is
called, some pages
may become completely unoccupied and can be returned to the buddy allocator. In
userspace, memory is given back to the kernel in a similar manner,
typically using
MADV_DONTNEED. Therefore, MADV_DONTNEED is the most common memory
reclamation behavior in Android, coming with free(), delete() or GC.

Imagine a system with extensive malloc, free, new, and delete
operations, where objects
are constantly being created and destroyed.

On the other hand, whether libc/Java use MADV_DONTNEED to free memory is not
crucial, although they do. We need a method to simulate the lifecycle
of applications
—exiting and starting anew—on PCs or Android phones. It doesn't matter if you
use MADV_DONTNEED or munmap to achieve this.

It is important to note that mTHP currently operates on a one-shot
basis(after swap-out,
you never get them back as mTHP as we don't support large folios
swapin). For the test
program, we need a method to generate new mTHPs continuously. Without this,
after the initial iterations, we would be left with only folios,
rendering the entire
test program *pointless*.

>
> > Additionally, in the event of an application exit, munmap, or OOM killer, the
> > amount of freed memory can be much larger than 64KB. The primary purpose
> > of using MADV_DONTNEED is to release anonymous memory and generate
> > new mTHP so that the iteration can continue. Otherwise, the test program
> > becomes entirely pointless, as we only have large folios at the beginning.
> > That is exactly why Chris has failed to find his bugs by using other small
> > programs.
>
> Although I still don't understand how 64KB aligned MADV_DONTNEED is used
> for libc/java heap or munmap in a practical way.  After more thoughts, I
> think 64KB Aligned MADV_DONTNEED can simulate the fragmentation effect
> of processes exit at some degree if 64KB folios in these processes are
> swapped out without splitting.  If you have no other practical use
> cases, I suggest to make it explicit with comments in program.
>
> > On the other hand, we definitely want large folios swap-in, otherwise, mTHP
> > is just a toy to Android or similar system where more than 2/3 memory could
> > be in swap. We do NOT want single-use mTHP.
>
> I agree that large folios swap-in has its value at least in some
> situations.  Whether we should take it as default behavior is another
> topic, we can discuss it further in the future.

Cool. Just imagine that mTHP is beneficial for systems that don't frequently
use swap. However, for Android, where most memory resides in swap, mTHP
acts like a one-way ticket: you end up with small folios and can't revert to
large ones. This is so BAD.

>
> >>
> >> > 3. Swap in with or without the "-a" option to observe how fragments
> >> > due to swap-in
> >> > and the incoming swap-in of large folios will impact swap-out fallback.
> >>
> >> It's good to create fragmentation with swap-in.  Which is more practical
> >> and future-proof.  And, I believe that we can reduce large folio
> >> swap-out fallback rate without the large folio swap-in trick.
> >>
> >> > And many thanks to Chris for the suggestion on improving it within
> >> > selftest, though I
> >> > prefer to place it in tools/mm.
>
> --
> Best Regards,
> Huang, Ying

Thanks
Barry
Huang, Ying June 24, 2024, 6:59 a.m. UTC | #28
Barry Song <21cnbao@gmail.com> writes:

> On Mon, Jun 24, 2024 at 3:44 PM Huang, Ying <ying.huang@intel.com> wrote:
>>
>> Barry Song <21cnbao@gmail.com> writes:
>>
>> > On Fri, Jun 21, 2024 at 9:24 PM Huang, Ying <ying.huang@intel.com> wrote:
>> >>
>> >> Barry Song <21cnbao@gmail.com> writes:
>> >>
>> >> > On Fri, Jun 21, 2024 at 7:25 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>> >> >>
>> >> >> On 20/06/2024 12:34, David Hildenbrand wrote:
>> >> >> > On 20.06.24 11:04, Ryan Roberts wrote:
>> >> >> >> On 20/06/2024 01:26, Barry Song wrote:
>> >> >> >>> From: Barry Song <v-songbaohua@oppo.com>
>> >> >> >>>
>> >> >> >>> Both Ryan and Chris have been utilizing the small test program to aid
>> >> >> >>> in debugging and identifying issues with swap entry allocation. While
>> >> >> >>> a real or intricate workload might be more suitable for assessing the
>> >> >> >>> correctness and effectiveness of the swap allocation policy, a small
>> >> >> >>> test program presents a simpler means of understanding the problem and
>> >> >> >>> initially verifying the improvements being made.
>> >> >> >>>
>> >> >> >>> Let's endeavor to integrate it into the self-test suite. Although it
>> >> >> >>> presently only accommodates 64KB and 4KB, I'm optimistic that we can
>> >> >> >>> expand its capabilities to support multiple sizes and simulate more
>> >> >> >>> complex systems in the future as required.
>> >> >> >>
>> >> >> >> I'll try to summarize the thread with Huang Ying by suggesting this test program
>> >> >> >> is "neccessary but not sufficient" to exhaustively test the mTHP swap-out path.
>> >> >> >> I've certainly found it useful and think it would be a valuable addition to the
>> >> >> >> tree.
>> >> >> >>
>> >> >> >> That said, I'm not convinced it is a selftest; IMO a selftest should provide a
>> >> >> >> clear pass/fail result against some criteria and must be able to be run
>> >> >> >> automatically by (e.g.) a CI system.
>> >> >> >
>> >> >> > Likely we should then consider moving other such performance-related thingies
>> >> >> > out of the selftests?
>> >> >>
>> >> >> Yes, that would get my vote. But of the 4 tests you mentioned that use
>> >> >> clock_gettime(), it looks like transhuge-stress is the only one that doesn't
>> >> >> have a pass/fail result, so is probably the only candidate for moving.
>> >> >>
>> >> >> The others either use the times as a timeout and determines failure if the
>> >> >> action didn't occur within the timeout (e.g. ksm_tests.c) or use it to add some
>> >> >> supplemental performance information to an otherwise functionality-oriented test.
>> >> >
>> >> > Thank you very much, Ryan. I think you've found a better home for this
>> >> > tool . I will
>> >> > send v2, relocating it to tools/mm and adding a function to swap in
>> >> > either the whole
>> >> > mTHPs or a portion of mTHPs by "-a"(aligned swapin).
>> >> >
>> >> > So basically, we will have
>> >> >
>> >> > 1. Use MADV_PAGEPUT for rapid swap-out, putting the swap allocation code under
>> >> > high exercise in a short time.
>> >> >
>> >> > 2. Use MADV_DONTNEED to simulate the behavior of libc and Java heap in freeing
>> >> > memory, as well as for munmap, app exits, or OOM killer scenarios. This ensures
>> >> > new mTHP is always generated, released or swapped out, similar to the behavior
>> >> > on a PC or Android phone where many applications are frequently started and
>> >> > terminated.
>> >>
>> >> MADV_DONTNEED 64KB memory, then memset() it, this just simulates the
>> >> large folio swap-in exactly, which hasn't been merged by upstream.  I
>> >> don't think that it's a good idea to make such kind of trick.
>> >
>> > I disagree. This is how userspace heaps can manage memory
>> > deallocation.
>>
>> Sorry, I don't understand how.  Can you show some examples?  Such as
>> strace log with 64KB aligned MADV_DONTNEED?
>
> In Java heap and memory allocators such as jemalloc and Scudo, memory is freed
> using the MADV_DONTNEED flag when either free() is called or garbage collection
> occurs. In Android, the Java heap is freed in chunks aligned to 64KB
> or larger.

Originally, I heard about that MADV_FREE is used by jemalloc.  Now, I
know that they use MADV_DONTNEED too.  Thanks!

Although I still suspect that libc/java allocator will free pages in
exact 64KB size (IIUC, they should free pages in much larger trunk).  I
agree that MADV_DONTNEED is a way to create fragmentation in swap
devices.

> In
> Scudo and jemalloc, there is a configuration option to set the
> management granularity.
> This granularity is set to match the mTHP size(though the default
> value is 16KB in the
> latest Android if we don't run mTHP). Otherwise, you could end up with
> millions of
> partial unmap operations, which would severely degrade the performance of mTHP.
>
> Imagine libc/Java functioning like a slab allocator. When kfree() is
> called, some pages
> may become completely unoccupied and can be returned to the buddy allocator. In
> userspace, memory is given back to the kernel in a similar manner,
> typically using
> MADV_DONTNEED. Therefore, MADV_DONTNEED is the most common memory
> reclamation behavior in Android, coming with free(), delete() or GC.
>
> Imagine a system with extensive malloc, free, new, and delete
> operations, where objects
> are constantly being created and destroyed.
>
> On the other hand, whether libc/Java use MADV_DONTNEED to free memory is not
> crucial, although they do. We need a method to simulate the lifecycle
> of applications
> —exiting and starting anew—on PCs or Android phones. It doesn't matter if you
> use MADV_DONTNEED or munmap to achieve this.
>
> It is important to note that mTHP currently operates on a one-shot
> basis(after swap-out,
> you never get them back as mTHP as we don't support large folios
> swapin). For the test
> program, we need a method to generate new mTHPs continuously. Without this,
> after the initial iterations, we would be left with only folios,
> rendering the entire
> test program *pointless*.

I understand the requirements for new mTHPs.

>>
>> > Additionally, in the event of an application exit, munmap, or OOM killer, the
>> > amount of freed memory can be much larger than 64KB. The primary purpose
>> > of using MADV_DONTNEED is to release anonymous memory and generate
>> > new mTHP so that the iteration can continue. Otherwise, the test program
>> > becomes entirely pointless, as we only have large folios at the beginning.
>> > That is exactly why Chris has failed to find his bugs by using other small
>> > programs.
>>
>> Although I still don't understand how 64KB aligned MADV_DONTNEED is used
>> for libc/java heap or munmap in a practical way.  After more thoughts, I
>> think 64KB Aligned MADV_DONTNEED can simulate the fragmentation effect
>> of processes exit at some degree if 64KB folios in these processes are
>> swapped out without splitting.  If you have no other practical use
>> cases, I suggest to make it explicit with comments in program.
>>

[snip]

--
Best Regards,
Huang, Ying
Barry Song June 24, 2024, 7:55 a.m. UTC | #29
On Mon, Jun 24, 2024 at 7:01 PM Huang, Ying <ying.huang@intel.com> wrote:
>
> Barry Song <21cnbao@gmail.com> writes:
>
> > On Mon, Jun 24, 2024 at 3:44 PM Huang, Ying <ying.huang@intel.com> wrote:
> >>
> >> Barry Song <21cnbao@gmail.com> writes:
> >>
> >> > On Fri, Jun 21, 2024 at 9:24 PM Huang, Ying <ying.huang@intel.com> wrote:
> >> >>
> >> >> Barry Song <21cnbao@gmail.com> writes:
> >> >>
> >> >> > On Fri, Jun 21, 2024 at 7:25 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >> >> >>
> >> >> >> On 20/06/2024 12:34, David Hildenbrand wrote:
> >> >> >> > On 20.06.24 11:04, Ryan Roberts wrote:
> >> >> >> >> On 20/06/2024 01:26, Barry Song wrote:
> >> >> >> >>> From: Barry Song <v-songbaohua@oppo.com>
> >> >> >> >>>
> >> >> >> >>> Both Ryan and Chris have been utilizing the small test program to aid
> >> >> >> >>> in debugging and identifying issues with swap entry allocation. While
> >> >> >> >>> a real or intricate workload might be more suitable for assessing the
> >> >> >> >>> correctness and effectiveness of the swap allocation policy, a small
> >> >> >> >>> test program presents a simpler means of understanding the problem and
> >> >> >> >>> initially verifying the improvements being made.
> >> >> >> >>>
> >> >> >> >>> Let's endeavor to integrate it into the self-test suite. Although it
> >> >> >> >>> presently only accommodates 64KB and 4KB, I'm optimistic that we can
> >> >> >> >>> expand its capabilities to support multiple sizes and simulate more
> >> >> >> >>> complex systems in the future as required.
> >> >> >> >>
> >> >> >> >> I'll try to summarize the thread with Huang Ying by suggesting this test program
> >> >> >> >> is "neccessary but not sufficient" to exhaustively test the mTHP swap-out path.
> >> >> >> >> I've certainly found it useful and think it would be a valuable addition to the
> >> >> >> >> tree.
> >> >> >> >>
> >> >> >> >> That said, I'm not convinced it is a selftest; IMO a selftest should provide a
> >> >> >> >> clear pass/fail result against some criteria and must be able to be run
> >> >> >> >> automatically by (e.g.) a CI system.
> >> >> >> >
> >> >> >> > Likely we should then consider moving other such performance-related thingies
> >> >> >> > out of the selftests?
> >> >> >>
> >> >> >> Yes, that would get my vote. But of the 4 tests you mentioned that use
> >> >> >> clock_gettime(), it looks like transhuge-stress is the only one that doesn't
> >> >> >> have a pass/fail result, so is probably the only candidate for moving.
> >> >> >>
> >> >> >> The others either use the times as a timeout and determines failure if the
> >> >> >> action didn't occur within the timeout (e.g. ksm_tests.c) or use it to add some
> >> >> >> supplemental performance information to an otherwise functionality-oriented test.
> >> >> >
> >> >> > Thank you very much, Ryan. I think you've found a better home for this
> >> >> > tool . I will
> >> >> > send v2, relocating it to tools/mm and adding a function to swap in
> >> >> > either the whole
> >> >> > mTHPs or a portion of mTHPs by "-a"(aligned swapin).
> >> >> >
> >> >> > So basically, we will have
> >> >> >
> >> >> > 1. Use MADV_PAGEPUT for rapid swap-out, putting the swap allocation code under
> >> >> > high exercise in a short time.
> >> >> >
> >> >> > 2. Use MADV_DONTNEED to simulate the behavior of libc and Java heap in freeing
> >> >> > memory, as well as for munmap, app exits, or OOM killer scenarios. This ensures
> >> >> > new mTHP is always generated, released or swapped out, similar to the behavior
> >> >> > on a PC or Android phone where many applications are frequently started and
> >> >> > terminated.
> >> >>
> >> >> MADV_DONTNEED 64KB memory, then memset() it, this just simulates the
> >> >> large folio swap-in exactly, which hasn't been merged by upstream.  I
> >> >> don't think that it's a good idea to make such kind of trick.
> >> >
> >> > I disagree. This is how userspace heaps can manage memory
> >> > deallocation.
> >>
> >> Sorry, I don't understand how.  Can you show some examples?  Such as
> >> strace log with 64KB aligned MADV_DONTNEED?
> >
> > In Java heap and memory allocators such as jemalloc and Scudo, memory is freed
> > using the MADV_DONTNEED flag when either free() is called or garbage collection
> > occurs. In Android, the Java heap is freed in chunks aligned to 64KB
> > or larger.
>
> Originally, I heard about that MADV_FREE is used by jemalloc.  Now, I
> know that they use MADV_DONTNEED too.  Thanks!
>
> Although I still suspect that libc/java allocator will free pages in
> exact 64KB size (IIUC, they should free pages in much larger trunk).  I
> agree that MADV_DONTNEED is a way to create fragmentation in swap
> devices.

Right.

They don't always free memory in exact 64KB sizes or mTHP size, but we
need to define a minimum granularity. Typically, when many objects are
freed, they combine into a larger free block, which is then released to
kernel all at once.

As an example, libc might map lots of 4MB VMAs and classify them into
different size categories—some for small objects and others for larger ones.
While attempts are made to consolidate adjacent free blocks to reduce
system calls, MADV_DONTNEED is often utilized at the minimum granularity
for small objects when merging is temporarily impractical - We don't always
encounter two or more memory blocks where all the objects have been
released :-)


>
> > In
> > Scudo and jemalloc, there is a configuration option to set the
> > management granularity.
> > This granularity is set to match the mTHP size(though the default
> > value is 16KB in the
> > latest Android if we don't run mTHP). Otherwise, you could end up with
> > millions of
> > partial unmap operations, which would severely degrade the performance of mTHP.
> >
> > Imagine libc/Java functioning like a slab allocator. When kfree() is
> > called, some pages
> > may become completely unoccupied and can be returned to the buddy allocator. In
> > userspace, memory is given back to the kernel in a similar manner,
> > typically using
> > MADV_DONTNEED. Therefore, MADV_DONTNEED is the most common memory
> > reclamation behavior in Android, coming with free(), delete() or GC.
> >
> > Imagine a system with extensive malloc, free, new, and delete
> > operations, where objects
> > are constantly being created and destroyed.
> >
> > On the other hand, whether libc/Java use MADV_DONTNEED to free memory is not
> > crucial, although they do. We need a method to simulate the lifecycle
> > of applications
> > —exiting and starting anew—on PCs or Android phones. It doesn't matter if you
> > use MADV_DONTNEED or munmap to achieve this.
> >
> > It is important to note that mTHP currently operates on a one-shot
> > basis(after swap-out,
> > you never get them back as mTHP as we don't support large folios
> > swapin). For the test
> > program, we need a method to generate new mTHPs continuously. Without this,
> > after the initial iterations, we would be left with only folios,
> > rendering the entire
> > test program *pointless*.
>
> I understand the requirements for new mTHPs.
>
> >>
> >> > Additionally, in the event of an application exit, munmap, or OOM killer, the
> >> > amount of freed memory can be much larger than 64KB. The primary purpose
> >> > of using MADV_DONTNEED is to release anonymous memory and generate
> >> > new mTHP so that the iteration can continue. Otherwise, the test program
> >> > becomes entirely pointless, as we only have large folios at the beginning.
> >> > That is exactly why Chris has failed to find his bugs by using other small
> >> > programs.
> >>
> >> Although I still don't understand how 64KB aligned MADV_DONTNEED is used
> >> for libc/java heap or munmap in a practical way.  After more thoughts, I
> >> think 64KB Aligned MADV_DONTNEED can simulate the fragmentation effect
> >> of processes exit at some degree if 64KB folios in these processes are
> >> swapped out without splitting.  If you have no other practical use
> >> cases, I suggest to make it explicit with comments in program.
> >>
>
> [snip]
>
> --
> Best Regards,
> Huang, Ying
diff mbox series

Patch

diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index e1aa09ddaa3d..64164ad66835 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -65,6 +65,7 @@  TEST_GEN_FILES += mseal_test
 TEST_GEN_FILES += seal_elf
 TEST_GEN_FILES += on-fault-limit
 TEST_GEN_FILES += pagemap_ioctl
+TEST_GEN_FILES += thp_swap_allocator_test
 TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += uffd-stress
diff --git a/tools/testing/selftests/mm/thp_swap_allocator_test.c b/tools/testing/selftests/mm/thp_swap_allocator_test.c
new file mode 100644
index 000000000000..4443a906d0f8
--- /dev/null
+++ b/tools/testing/selftests/mm/thp_swap_allocator_test.c
@@ -0,0 +1,192 @@ 
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * thp_swap_allocator_test
+ *
+ * The purpose of this test program is helping check if THP swpout
+ * can correctly get swap slots to swap out as a whole instead of
+ * being split. It randomly releases swap entries through madvise
+ * DONTNEED and do swapout on two memory areas: a memory area for
+ * 64KB THP and the other area for small folios. The second memory
+ * can be enabled by "-s".
+ * Before running the program, we need to setup a zRAM or similar
+ * swap device by:
+ *  echo lzo > /sys/block/zram0/comp_algorithm
+ *  echo 64M > /sys/block/zram0/disksize
+ *  echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
+ *  echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
+ *  mkswap /dev/zram0
+ *  swapon /dev/zram0
+ * The expected result should be 0% anon swpout fallback ratio w/ or
+ * w/o "-s".
+ *
+ * Author(s): Barry Song <v-songbaohua@oppo.com>
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <time.h>
+
+#define MEMSIZE_MTHP (60 * 1024 * 1024)
+#define MEMSIZE_SMALLFOLIO (1 * 1024 * 1024)
+#define ALIGNMENT_MTHP (64 * 1024)
+#define ALIGNMENT_SMALLFOLIO (4 * 1024)
+#define TOTAL_DONTNEED_MTHP (16 * 1024 * 1024)
+#define TOTAL_DONTNEED_SMALLFOLIO (768 * 1024)
+#define MTHP_FOLIO_SIZE (64 * 1024)
+
+#define SWPOUT_PATH \
+	"/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout"
+#define SWPOUT_FALLBACK_PATH \
+	"/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback"
+
+static void *aligned_alloc_mem(size_t size, size_t alignment)
+{
+	void *mem = NULL;
+
+	if (posix_memalign(&mem, alignment, size) != 0) {
+		perror("posix_memalign");
+		return NULL;
+	}
+	return mem;
+}
+
+static void random_madvise_dontneed(void *mem, size_t mem_size,
+		size_t align_size, size_t total_dontneed_size)
+{
+	size_t num_pages = total_dontneed_size / align_size;
+	size_t i;
+	size_t offset;
+	void *addr;
+
+	for (i = 0; i < num_pages; ++i) {
+		offset = (rand() % (mem_size / align_size)) * align_size;
+		addr = (char *)mem + offset;
+		if (madvise(addr, align_size, MADV_DONTNEED) != 0)
+			perror("madvise dontneed");
+
+		memset(addr, 0x11, align_size);
+	}
+}
+
+static unsigned long read_stat(const char *path)
+{
+	FILE *file;
+	unsigned long value;
+
+	file = fopen(path, "r");
+	if (!file) {
+		perror("fopen");
+		return 0;
+	}
+
+	if (fscanf(file, "%lu", &value) != 1) {
+		perror("fscanf");
+		fclose(file);
+		return 0;
+	}
+
+	fclose(file);
+	return value;
+}
+
+int main(int argc, char *argv[])
+{
+	int use_small_folio = 0;
+	int i;
+	void *mem1 = aligned_alloc_mem(MEMSIZE_MTHP, ALIGNMENT_MTHP);
+	void *mem2 = NULL;
+
+	if (mem1 == NULL) {
+		fprintf(stderr, "Failed to allocate 60MB memory\n");
+		return EXIT_FAILURE;
+	}
+
+	if (madvise(mem1, MEMSIZE_MTHP, MADV_HUGEPAGE) != 0) {
+		perror("madvise hugepage for mem1");
+		free(mem1);
+		return EXIT_FAILURE;
+	}
+
+	for (i = 1; i < argc; ++i) {
+		if (strcmp(argv[i], "-s") == 0)
+			use_small_folio = 1;
+	}
+
+	if (use_small_folio) {
+		mem2 = aligned_alloc_mem(MEMSIZE_SMALLFOLIO, ALIGNMENT_MTHP);
+		if (mem2 == NULL) {
+			fprintf(stderr, "Failed to allocate 1MB memory\n");
+			free(mem1);
+			return EXIT_FAILURE;
+		}
+
+		if (madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_NOHUGEPAGE) != 0) {
+			perror("madvise nohugepage for mem2");
+			free(mem1);
+			free(mem2);
+			return EXIT_FAILURE;
+		}
+	}
+
+	for (i = 0; i < 100; ++i) {
+		unsigned long initial_swpout;
+		unsigned long initial_swpout_fallback;
+		unsigned long final_swpout;
+		unsigned long final_swpout_fallback;
+		unsigned long swpout_inc;
+		unsigned long swpout_fallback_inc;
+		double fallback_percentage;
+
+		initial_swpout = read_stat(SWPOUT_PATH);
+		initial_swpout_fallback = read_stat(SWPOUT_FALLBACK_PATH);
+
+		random_madvise_dontneed(mem1, MEMSIZE_MTHP, ALIGNMENT_MTHP,
+				TOTAL_DONTNEED_MTHP);
+
+		if (use_small_folio) {
+			random_madvise_dontneed(mem2, MEMSIZE_SMALLFOLIO,
+					ALIGNMENT_SMALLFOLIO,
+					TOTAL_DONTNEED_SMALLFOLIO);
+		}
+
+		if (madvise(mem1, MEMSIZE_MTHP, MADV_PAGEOUT) != 0) {
+			perror("madvise pageout for mem1");
+			free(mem1);
+			if (mem2 != NULL)
+				free(mem2);
+			return EXIT_FAILURE;
+		}
+
+		if (use_small_folio) {
+			if (madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_PAGEOUT) != 0) {
+				perror("madvise pageout for mem2");
+				free(mem1);
+				free(mem2);
+				return EXIT_FAILURE;
+			}
+		}
+
+		final_swpout = read_stat(SWPOUT_PATH);
+		final_swpout_fallback = read_stat(SWPOUT_FALLBACK_PATH);
+
+		swpout_inc = final_swpout - initial_swpout;
+		swpout_fallback_inc = final_swpout_fallback - initial_swpout_fallback;
+
+		fallback_percentage = (double)swpout_fallback_inc /
+			(swpout_fallback_inc + swpout_inc) * 100;
+
+		printf("Iteration %d: swpout inc: %lu, swpout fallback inc: %lu, Fallback percentage: %.2f%%\n",
+				i + 1, swpout_inc, swpout_fallback_inc, fallback_percentage);
+	}
+
+	free(mem1);
+	if (mem2 != NULL)
+		free(mem2);
+
+	return EXIT_SUCCESS;
+}