Message ID | 20250121215641.1764359-2-joannelkoong@gmail.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | fstests: test reads/writes from hugepages-backed buffers | expand |
On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote: > Add support for reads/writes from buffers backed by hugepages. > This can be enabled through the '-h' flag. This flag should only be used > on systems where THP capabilities are enabled. > > This is motivated by a recent bug that was due to faulty handling of > userspace buffers backed by hugepages. This patch is a mitigation > against problems like this in the future. > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com> > Reviewed-by: Brian Foster <bfoster@redhat.com> > Reviewed-by: Darrick J. Wong <djwong@kernel.org> > --- Those two test cases fail on old system which doesn't support MADV_COLLAPSE: fsx -N 10000 -l 500000 -h -fsx -N 10000 -o 8192 -l 500000 -h -fsx -N 10000 -o 128000 -l 500000 -h +MADV_COLLAPSE not supported. Can't support -h and fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h +mapped writes DISABLED +MADV_COLLAPSE not supported. Can't support -h I'm wondering ... > ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++----- > 1 file changed, 153 insertions(+), 13 deletions(-) > > diff --git a/ltp/fsx.c b/ltp/fsx.c > index 41933354..3be383c6 100644 > --- a/ltp/fsx.c > +++ b/ltp/fsx.c > @@ -190,6 +190,16 @@ int o_direct; /* -Z */ > int aio = 0; > int uring = 0; > int mark_nr = 0; > +int hugepages = 0; /* -h flag */ > + > +/* Stores info needed to periodically collapse hugepages */ > +struct hugepages_collapse_info { > + void *orig_good_buf; > + long good_buf_size; > + void *orig_temp_buf; > + long temp_buf_size; > +}; > +struct hugepages_collapse_info hugepages_info; > > int page_size; > int page_mask; > @@ -2471,7 +2481,7 @@ void > usage(void) > { > fprintf(stdout, "usage: %s", > - "fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\ > + "fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\ > [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\ > [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\ > [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\ > @@ -2483,8 +2493,11 @@ usage(void) > -d: debug output for all operations\n\ > -e: pollute post-eof on size changes (default 0)\n\ > -f: flush and invalidate cache after I/O\n\ > - -g X: write character X instead of random generated data\n\ > - -i logdev: do integrity testing, logdev is the dm log writes device\n\ > + -g X: write character X instead of random generated data\n" > +#ifdef MADV_COLLAPSE > +" -h hugepages: use buffers backed by hugepages for reads/writes\n" > +#endif > +" -i logdev: do integrity testing, logdev is the dm log writes device\n\ > -j logid: prefix debug log messsages with this id\n\ > -k: do not truncate existing file and use its size as upper bound on file size\n\ > -l flen: the upper bound on file size (default 262144)\n\ > @@ -2833,11 +2846,41 @@ __test_fallocate(int mode, const char *mode_str) > #endif > } > > +/* > + * Reclaim may break up hugepages, so do a best-effort collapse every once in > + * a while. > + */ > +static void > +collapse_hugepages(void) > +{ > +#ifdef MADV_COLLAPSE > + int ret; > + > + /* re-collapse every 16k fsxops after we start */ > + if (!numops || (numops & ((1U << 14) - 1))) > + return; > + > + ret = madvise(hugepages_info.orig_good_buf, > + hugepages_info.good_buf_size, MADV_COLLAPSE); > + if (ret) > + prt("collapsing hugepages for good_buf failed (numops=%llu): %s\n", > + numops, strerror(errno)); > + ret = madvise(hugepages_info.orig_temp_buf, > + hugepages_info.temp_buf_size, MADV_COLLAPSE); > + if (ret) > + prt("collapsing hugepages for temp_buf failed (numops=%llu): %s\n", > + numops, strerror(errno)); > +#endif > +} > + > bool > keep_running(void) > { > int ret; > > + if (hugepages) > + collapse_hugepages(); > + > if (deadline.tv_nsec) { > struct timespec now; > > @@ -2856,6 +2899,103 @@ keep_running(void) > return numops-- != 0; > } > > +static long > +get_hugepage_size(void) > +{ > + const char str[] = "Hugepagesize:"; > + size_t str_len = sizeof(str) - 1; > + unsigned int hugepage_size = 0; > + char buffer[64]; > + FILE *file; > + > + file = fopen("/proc/meminfo", "r"); > + if (!file) { > + prterr("get_hugepage_size: fopen /proc/meminfo"); > + return -1; > + } > + while (fgets(buffer, sizeof(buffer), file)) { > + if (strncmp(buffer, str, str_len) == 0) { > + sscanf(buffer + str_len, "%u", &hugepage_size); > + break; > + } > + } > + fclose(file); > + if (!hugepage_size) { > + prterr("get_hugepage_size: failed to find " > + "hugepage size in /proc/meminfo\n"); > + return -1; > + } > + > + /* convert from KiB to bytes */ > + return hugepage_size << 10; > +} > + > +static void * > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment, long *buf_size) > +{ > + void *buf = NULL; > +#ifdef MADV_COLLAPSE > + int ret; > + long size = roundup(len, hugepage_size) + alignment; > + > + ret = posix_memalign(&buf, hugepage_size, size); > + if (ret) { > + prterr("posix_memalign for buf"); > + return NULL; > + } > + memset(buf, '\0', size); > + ret = madvise(buf, size, MADV_COLLAPSE); > + if (ret) { > + prterr("madvise collapse for buf"); > + free(buf); > + return NULL; > + } > + > + *buf_size = size; > +#endif > + return buf; > +} > + > +static void > +init_buffers(void) > +{ > + int i; > + > + original_buf = (char *) malloc(maxfilelen); > + for (i = 0; i < maxfilelen; i++) > + original_buf[i] = random() % 256; > + if (hugepages) { > + long hugepage_size = get_hugepage_size(); > + if (hugepage_size == -1) { > + prterr("get_hugepage_size()"); > + exit(102); > + } > + good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy, > + &hugepages_info.good_buf_size); > + if (!good_buf) { > + prterr("init_hugepages_buf failed for good_buf"); > + exit(103); > + } > + hugepages_info.orig_good_buf = good_buf; > + > + temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy, > + &hugepages_info.temp_buf_size); > + if (!temp_buf) { > + prterr("init_hugepages_buf failed for temp_buf"); > + exit(103); > + } > + hugepages_info.orig_temp_buf = temp_buf; > + } else { > + unsigned long good_buf_len = maxfilelen + writebdy; > + unsigned long temp_buf_len = maxoplen + readbdy; > + > + good_buf = calloc(1, good_buf_len); > + temp_buf = calloc(1, temp_buf_len); > + } > + good_buf = round_ptr_up(good_buf, writebdy, 0); > + temp_buf = round_ptr_up(temp_buf, readbdy, 0); > +} > + > static struct option longopts[] = { > {"replay-ops", required_argument, 0, 256}, > {"record-ops", optional_argument, 0, 255}, > @@ -2883,7 +3023,7 @@ main(int argc, char **argv) > setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */ > > while ((ch = getopt_long(argc, argv, > - "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > + "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > longopts, NULL)) != EOF) > switch (ch) { > case 'b': > @@ -2916,6 +3056,14 @@ main(int argc, char **argv) > case 'g': > filldata = *optarg; > break; > + case 'h': > +#ifndef MADV_COLLAPSE > + fprintf(stderr, "MADV_COLLAPSE not supported. " > + "Can't support -h\n"); > + exit(86); > +#endif > + hugepages = 1; > + break; ... if we could change this part to: #ifdef MADV_COLLAPSE hugepages = 1; #endif break; to avoid the test failures on old systems. Or any better ideas from you :) Thanks, Zorro > case 'i': > integrity = 1; > logdev = strdup(optarg); > @@ -3229,15 +3377,7 @@ main(int argc, char **argv) > exit(95); > } > } > - original_buf = (char *) malloc(maxfilelen); > - for (i = 0; i < maxfilelen; i++) > - original_buf[i] = random() % 256; > - good_buf = (char *) malloc(maxfilelen + writebdy); > - good_buf = round_ptr_up(good_buf, writebdy, 0); > - memset(good_buf, '\0', maxfilelen); > - temp_buf = (char *) malloc(maxoplen + readbdy); > - temp_buf = round_ptr_up(temp_buf, readbdy, 0); > - memset(temp_buf, '\0', maxoplen); > + init_buffers(); > if (lite) { /* zero entire existing file */ > ssize_t written; > > -- > 2.47.1 >
On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote: > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote: > > Add support for reads/writes from buffers backed by hugepages. > > This can be enabled through the '-h' flag. This flag should only be used > > on systems where THP capabilities are enabled. > > > > This is motivated by a recent bug that was due to faulty handling of > > userspace buffers backed by hugepages. This patch is a mitigation > > against problems like this in the future. > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com> > > Reviewed-by: Brian Foster <bfoster@redhat.com> > > Reviewed-by: Darrick J. Wong <djwong@kernel.org> > > --- > > Those two test cases fail on old system which doesn't support > MADV_COLLAPSE: > > fsx -N 10000 -l 500000 -h > -fsx -N 10000 -o 8192 -l 500000 -h > -fsx -N 10000 -o 128000 -l 500000 -h > +MADV_COLLAPSE not supported. Can't support -h > > and > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > +mapped writes DISABLED > +MADV_COLLAPSE not supported. Can't support -h > > I'm wondering ... > > > ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++----- > > 1 file changed, 153 insertions(+), 13 deletions(-) > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > index 41933354..3be383c6 100644 > > --- a/ltp/fsx.c > > +++ b/ltp/fsx.c > > @@ -190,6 +190,16 @@ int o_direct; /* -Z */ > > int aio = 0; > > int uring = 0; > > int mark_nr = 0; > > +int hugepages = 0; /* -h flag */ > > + > > +/* Stores info needed to periodically collapse hugepages */ > > +struct hugepages_collapse_info { > > + void *orig_good_buf; > > + long good_buf_size; > > + void *orig_temp_buf; > > + long temp_buf_size; > > +}; > > +struct hugepages_collapse_info hugepages_info; > > > > int page_size; > > int page_mask; > > @@ -2471,7 +2481,7 @@ void > > usage(void) > > { > > fprintf(stdout, "usage: %s", > > - "fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\ > > + "fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\ > > [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\ > > [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\ > > [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\ > > @@ -2483,8 +2493,11 @@ usage(void) > > -d: debug output for all operations\n\ > > -e: pollute post-eof on size changes (default 0)\n\ > > -f: flush and invalidate cache after I/O\n\ > > - -g X: write character X instead of random generated data\n\ > > - -i logdev: do integrity testing, logdev is the dm log writes device\n\ > > + -g X: write character X instead of random generated data\n" > > +#ifdef MADV_COLLAPSE > > +" -h hugepages: use buffers backed by hugepages for reads/writes\n" > > +#endif > > +" -i logdev: do integrity testing, logdev is the dm log writes device\n\ > > -j logid: prefix debug log messsages with this id\n\ > > -k: do not truncate existing file and use its size as upper bound on file size\n\ > > -l flen: the upper bound on file size (default 262144)\n\ > > @@ -2833,11 +2846,41 @@ __test_fallocate(int mode, const char *mode_str) > > #endif > > } > > > > +/* > > + * Reclaim may break up hugepages, so do a best-effort collapse every once in > > + * a while. > > + */ > > +static void > > +collapse_hugepages(void) > > +{ > > +#ifdef MADV_COLLAPSE > > + int ret; > > + > > + /* re-collapse every 16k fsxops after we start */ > > + if (!numops || (numops & ((1U << 14) - 1))) > > + return; > > + > > + ret = madvise(hugepages_info.orig_good_buf, > > + hugepages_info.good_buf_size, MADV_COLLAPSE); > > + if (ret) > > + prt("collapsing hugepages for good_buf failed (numops=%llu): %s\n", > > + numops, strerror(errno)); > > + ret = madvise(hugepages_info.orig_temp_buf, > > + hugepages_info.temp_buf_size, MADV_COLLAPSE); > > + if (ret) > > + prt("collapsing hugepages for temp_buf failed (numops=%llu): %s\n", > > + numops, strerror(errno)); > > +#endif > > +} > > + > > bool > > keep_running(void) > > { > > int ret; > > > > + if (hugepages) > > + collapse_hugepages(); > > + > > if (deadline.tv_nsec) { > > struct timespec now; > > > > @@ -2856,6 +2899,103 @@ keep_running(void) > > return numops-- != 0; > > } > > > > +static long > > +get_hugepage_size(void) > > +{ > > + const char str[] = "Hugepagesize:"; > > + size_t str_len = sizeof(str) - 1; > > + unsigned int hugepage_size = 0; > > + char buffer[64]; > > + FILE *file; > > + > > + file = fopen("/proc/meminfo", "r"); > > + if (!file) { > > + prterr("get_hugepage_size: fopen /proc/meminfo"); > > + return -1; > > + } > > + while (fgets(buffer, sizeof(buffer), file)) { > > + if (strncmp(buffer, str, str_len) == 0) { > > + sscanf(buffer + str_len, "%u", &hugepage_size); > > + break; > > + } > > + } > > + fclose(file); > > + if (!hugepage_size) { > > + prterr("get_hugepage_size: failed to find " > > + "hugepage size in /proc/meminfo\n"); > > + return -1; > > + } > > + > > + /* convert from KiB to bytes */ > > + return hugepage_size << 10; > > +} > > + > > +static void * > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment, long *buf_size) > > +{ > > + void *buf = NULL; > > +#ifdef MADV_COLLAPSE > > + int ret; > > + long size = roundup(len, hugepage_size) + alignment; > > + > > + ret = posix_memalign(&buf, hugepage_size, size); > > + if (ret) { > > + prterr("posix_memalign for buf"); > > + return NULL; > > + } > > + memset(buf, '\0', size); > > + ret = madvise(buf, size, MADV_COLLAPSE); > > + if (ret) { > > + prterr("madvise collapse for buf"); > > + free(buf); > > + return NULL; > > + } > > + > > + *buf_size = size; > > +#endif > > + return buf; > > +} > > + > > +static void > > +init_buffers(void) > > +{ > > + int i; > > + > > + original_buf = (char *) malloc(maxfilelen); > > + for (i = 0; i < maxfilelen; i++) > > + original_buf[i] = random() % 256; > > + if (hugepages) { > > + long hugepage_size = get_hugepage_size(); > > + if (hugepage_size == -1) { > > + prterr("get_hugepage_size()"); > > + exit(102); > > + } > > + good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy, > > + &hugepages_info.good_buf_size); > > + if (!good_buf) { > > + prterr("init_hugepages_buf failed for good_buf"); > > + exit(103); > > + } > > + hugepages_info.orig_good_buf = good_buf; > > + > > + temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy, > > + &hugepages_info.temp_buf_size); > > + if (!temp_buf) { > > + prterr("init_hugepages_buf failed for temp_buf"); > > + exit(103); > > + } > > + hugepages_info.orig_temp_buf = temp_buf; > > + } else { > > + unsigned long good_buf_len = maxfilelen + writebdy; > > + unsigned long temp_buf_len = maxoplen + readbdy; > > + > > + good_buf = calloc(1, good_buf_len); > > + temp_buf = calloc(1, temp_buf_len); > > + } > > + good_buf = round_ptr_up(good_buf, writebdy, 0); > > + temp_buf = round_ptr_up(temp_buf, readbdy, 0); > > +} > > + > > static struct option longopts[] = { > > {"replay-ops", required_argument, 0, 256}, > > {"record-ops", optional_argument, 0, 255}, > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv) > > setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */ > > > > while ((ch = getopt_long(argc, argv, > > - "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > + "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > longopts, NULL)) != EOF) > > switch (ch) { > > case 'b': > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv) > > case 'g': > > filldata = *optarg; > > break; > > + case 'h': > > +#ifndef MADV_COLLAPSE > > + fprintf(stderr, "MADV_COLLAPSE not supported. " > > + "Can't support -h\n"); > > + exit(86); > > +#endif > > + hugepages = 1; > > + break; > > ... > if we could change this part to: > > #ifdef MADV_COLLAPSE > hugepages = 1; > #endif > break; > > to avoid the test failures on old systems. > > Or any better ideas from you :) Hi Zorro, It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What do you think about skipping generic/758 and generic/759 if the kernel version is older than 6.1? That to me seems more preferable than the paste above, as the paste above would execute the test as if it did test hugepages when in reality it didn't, which would be misleading. Thanks, Joanne > > Thanks, > Zorro > > > case 'i': > > integrity = 1; > > logdev = strdup(optarg); > > @@ -3229,15 +3377,7 @@ main(int argc, char **argv) > > exit(95); > > } > > } > > - original_buf = (char *) malloc(maxfilelen); > > - for (i = 0; i < maxfilelen; i++) > > - original_buf[i] = random() % 256; > > - good_buf = (char *) malloc(maxfilelen + writebdy); > > - good_buf = round_ptr_up(good_buf, writebdy, 0); > > - memset(good_buf, '\0', maxfilelen); > > - temp_buf = (char *) malloc(maxoplen + readbdy); > > - temp_buf = round_ptr_up(temp_buf, readbdy, 0); > > - memset(temp_buf, '\0', maxoplen); > > + init_buffers(); > > if (lite) { /* zero entire existing file */ > > ssize_t written; > > > > -- > > 2.47.1 > > >
On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote: > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote: > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote: > > > Add support for reads/writes from buffers backed by hugepages. > > > This can be enabled through the '-h' flag. This flag should only be used > > > on systems where THP capabilities are enabled. > > > > > > This is motivated by a recent bug that was due to faulty handling of > > > userspace buffers backed by hugepages. This patch is a mitigation > > > against problems like this in the future. > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com> > > > Reviewed-by: Brian Foster <bfoster@redhat.com> > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org> > > > --- > > > > Those two test cases fail on old system which doesn't support > > MADV_COLLAPSE: > > > > fsx -N 10000 -l 500000 -h > > -fsx -N 10000 -o 8192 -l 500000 -h > > -fsx -N 10000 -o 128000 -l 500000 -h > > +MADV_COLLAPSE not supported. Can't support -h > > > > and > > > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > +mapped writes DISABLED > > +MADV_COLLAPSE not supported. Can't support -h > > > > I'm wondering ... > > > > > ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++----- > > > 1 file changed, 153 insertions(+), 13 deletions(-) > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > > index 41933354..3be383c6 100644 > > > --- a/ltp/fsx.c > > > +++ b/ltp/fsx.c > > > @@ -190,6 +190,16 @@ int o_direct; /* -Z */ > > > int aio = 0; > > > int uring = 0; > > > int mark_nr = 0; > > > +int hugepages = 0; /* -h flag */ > > > + > > > +/* Stores info needed to periodically collapse hugepages */ > > > +struct hugepages_collapse_info { > > > + void *orig_good_buf; > > > + long good_buf_size; > > > + void *orig_temp_buf; > > > + long temp_buf_size; > > > +}; > > > +struct hugepages_collapse_info hugepages_info; > > > > > > int page_size; > > > int page_mask; > > > @@ -2471,7 +2481,7 @@ void > > > usage(void) > > > { > > > fprintf(stdout, "usage: %s", > > > - "fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\ > > > + "fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\ > > > [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\ > > > [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\ > > > [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\ > > > @@ -2483,8 +2493,11 @@ usage(void) > > > -d: debug output for all operations\n\ > > > -e: pollute post-eof on size changes (default 0)\n\ > > > -f: flush and invalidate cache after I/O\n\ > > > - -g X: write character X instead of random generated data\n\ > > > - -i logdev: do integrity testing, logdev is the dm log writes device\n\ > > > + -g X: write character X instead of random generated data\n" > > > +#ifdef MADV_COLLAPSE > > > +" -h hugepages: use buffers backed by hugepages for reads/writes\n" > > > +#endif > > > +" -i logdev: do integrity testing, logdev is the dm log writes device\n\ > > > -j logid: prefix debug log messsages with this id\n\ > > > -k: do not truncate existing file and use its size as upper bound on file size\n\ > > > -l flen: the upper bound on file size (default 262144)\n\ > > > @@ -2833,11 +2846,41 @@ __test_fallocate(int mode, const char *mode_str) > > > #endif > > > } > > > > > > +/* > > > + * Reclaim may break up hugepages, so do a best-effort collapse every once in > > > + * a while. > > > + */ > > > +static void > > > +collapse_hugepages(void) > > > +{ > > > +#ifdef MADV_COLLAPSE > > > + int ret; > > > + > > > + /* re-collapse every 16k fsxops after we start */ > > > + if (!numops || (numops & ((1U << 14) - 1))) > > > + return; > > > + > > > + ret = madvise(hugepages_info.orig_good_buf, > > > + hugepages_info.good_buf_size, MADV_COLLAPSE); > > > + if (ret) > > > + prt("collapsing hugepages for good_buf failed (numops=%llu): %s\n", > > > + numops, strerror(errno)); > > > + ret = madvise(hugepages_info.orig_temp_buf, > > > + hugepages_info.temp_buf_size, MADV_COLLAPSE); > > > + if (ret) > > > + prt("collapsing hugepages for temp_buf failed (numops=%llu): %s\n", > > > + numops, strerror(errno)); > > > +#endif > > > +} > > > + > > > bool > > > keep_running(void) > > > { > > > int ret; > > > > > > + if (hugepages) > > > + collapse_hugepages(); > > > + > > > if (deadline.tv_nsec) { > > > struct timespec now; > > > > > > @@ -2856,6 +2899,103 @@ keep_running(void) > > > return numops-- != 0; > > > } > > > > > > +static long > > > +get_hugepage_size(void) > > > +{ > > > + const char str[] = "Hugepagesize:"; > > > + size_t str_len = sizeof(str) - 1; > > > + unsigned int hugepage_size = 0; > > > + char buffer[64]; > > > + FILE *file; > > > + > > > + file = fopen("/proc/meminfo", "r"); > > > + if (!file) { > > > + prterr("get_hugepage_size: fopen /proc/meminfo"); > > > + return -1; > > > + } > > > + while (fgets(buffer, sizeof(buffer), file)) { > > > + if (strncmp(buffer, str, str_len) == 0) { > > > + sscanf(buffer + str_len, "%u", &hugepage_size); > > > + break; > > > + } > > > + } > > > + fclose(file); > > > + if (!hugepage_size) { > > > + prterr("get_hugepage_size: failed to find " > > > + "hugepage size in /proc/meminfo\n"); > > > + return -1; > > > + } > > > + > > > + /* convert from KiB to bytes */ > > > + return hugepage_size << 10; > > > +} > > > + > > > +static void * > > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment, long *buf_size) > > > +{ > > > + void *buf = NULL; > > > +#ifdef MADV_COLLAPSE > > > + int ret; > > > + long size = roundup(len, hugepage_size) + alignment; > > > + > > > + ret = posix_memalign(&buf, hugepage_size, size); > > > + if (ret) { > > > + prterr("posix_memalign for buf"); > > > + return NULL; > > > + } > > > + memset(buf, '\0', size); > > > + ret = madvise(buf, size, MADV_COLLAPSE); > > > + if (ret) { > > > + prterr("madvise collapse for buf"); > > > + free(buf); > > > + return NULL; > > > + } > > > + > > > + *buf_size = size; > > > +#endif > > > + return buf; > > > +} > > > + > > > +static void > > > +init_buffers(void) > > > +{ > > > + int i; > > > + > > > + original_buf = (char *) malloc(maxfilelen); > > > + for (i = 0; i < maxfilelen; i++) > > > + original_buf[i] = random() % 256; > > > + if (hugepages) { > > > + long hugepage_size = get_hugepage_size(); > > > + if (hugepage_size == -1) { > > > + prterr("get_hugepage_size()"); > > > + exit(102); > > > + } > > > + good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy, > > > + &hugepages_info.good_buf_size); > > > + if (!good_buf) { > > > + prterr("init_hugepages_buf failed for good_buf"); > > > + exit(103); > > > + } > > > + hugepages_info.orig_good_buf = good_buf; > > > + > > > + temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy, > > > + &hugepages_info.temp_buf_size); > > > + if (!temp_buf) { > > > + prterr("init_hugepages_buf failed for temp_buf"); > > > + exit(103); > > > + } > > > + hugepages_info.orig_temp_buf = temp_buf; > > > + } else { > > > + unsigned long good_buf_len = maxfilelen + writebdy; > > > + unsigned long temp_buf_len = maxoplen + readbdy; > > > + > > > + good_buf = calloc(1, good_buf_len); > > > + temp_buf = calloc(1, temp_buf_len); > > > + } > > > + good_buf = round_ptr_up(good_buf, writebdy, 0); > > > + temp_buf = round_ptr_up(temp_buf, readbdy, 0); > > > +} > > > + > > > static struct option longopts[] = { > > > {"replay-ops", required_argument, 0, 256}, > > > {"record-ops", optional_argument, 0, 255}, > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv) > > > setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */ > > > > > > while ((ch = getopt_long(argc, argv, > > > - "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > + "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > longopts, NULL)) != EOF) > > > switch (ch) { > > > case 'b': > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv) > > > case 'g': > > > filldata = *optarg; > > > break; > > > + case 'h': > > > +#ifndef MADV_COLLAPSE > > > + fprintf(stderr, "MADV_COLLAPSE not supported. " > > > + "Can't support -h\n"); > > > + exit(86); > > > +#endif > > > + hugepages = 1; > > > + break; > > > > ... > > if we could change this part to: > > > > #ifdef MADV_COLLAPSE > > hugepages = 1; > > #endif > > break; > > > > to avoid the test failures on old systems. > > > > Or any better ideas from you :) > > Hi Zorro, > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What > do you think about skipping generic/758 and generic/759 if the kernel > version is older than 6.1? That to me seems more preferable than the > paste above, as the paste above would execute the test as if it did > test hugepages when in reality it didn't, which would be misleading. Now that I've gotten to try this out -- There's a couple of things going on here. The first is that generic/759 and 760 need to check if invoking fsx -h causes it to spit out the "MADV_COLLAPSE not supported" error and _notrun the test. The second thing is that userspace programs can ensure the existence of MADV_COLLAPSE in multiple ways. The first way is through sys/mman.h, which requires that the underlying C library headers are new enough to include a definition. glibc 2.37 is new enough, but even things like Debian 12 and RHEL 9 aren't new enough to have that. Other C libraries might not follow glibc's practice of wrapping and/or redefining symbols in a way that you hope is the same as... The second way is through linux/mman.h, which comes from the kernel headers package; and the third way is for the program to define it itself if nobody else does. So I think the easiest way to fix the fsx.c build is to include linux/mman.h in addition to sys/mman.h. Sorry I didn't notice these details when I reviewed your patch; I'm a little attention constrained ATM trying to get a large pile of bugfixes and redesigns reviewed so for-next can finally move forward again. --D > > Thanks, > Joanne > > > > > Thanks, > > Zorro > > > > > case 'i': > > > integrity = 1; > > > logdev = strdup(optarg); > > > @@ -3229,15 +3377,7 @@ main(int argc, char **argv) > > > exit(95); > > > } > > > } > > > - original_buf = (char *) malloc(maxfilelen); > > > - for (i = 0; i < maxfilelen; i++) > > > - original_buf[i] = random() % 256; > > > - good_buf = (char *) malloc(maxfilelen + writebdy); > > > - good_buf = round_ptr_up(good_buf, writebdy, 0); > > > - memset(good_buf, '\0', maxfilelen); > > > - temp_buf = (char *) malloc(maxoplen + readbdy); > > > - temp_buf = round_ptr_up(temp_buf, readbdy, 0); > > > - memset(temp_buf, '\0', maxoplen); > > > + init_buffers(); > > > if (lite) { /* zero entire existing file */ > > > ssize_t written; > > > > > > -- > > > 2.47.1 > > > > >
On Mon, Feb 3, 2025 at 10:59 AM Darrick J. Wong <djwong@kernel.org> wrote: > > On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote: > > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote: > > > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote: > > > > Add support for reads/writes from buffers backed by hugepages. > > > > This can be enabled through the '-h' flag. This flag should only be used > > > > on systems where THP capabilities are enabled. > > > > > > > > This is motivated by a recent bug that was due to faulty handling of > > > > userspace buffers backed by hugepages. This patch is a mitigation > > > > against problems like this in the future. > > > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com> > > > > Reviewed-by: Brian Foster <bfoster@redhat.com> > > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org> > > > > --- > > > > > > Those two test cases fail on old system which doesn't support > > > MADV_COLLAPSE: > > > > > > fsx -N 10000 -l 500000 -h > > > -fsx -N 10000 -o 8192 -l 500000 -h > > > -fsx -N 10000 -o 128000 -l 500000 -h > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > and > > > > > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > +mapped writes DISABLED > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > I'm wondering ... > > > > > > > ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++----- > > > > 1 file changed, 153 insertions(+), 13 deletions(-) > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > > > index 41933354..3be383c6 100644 > > > > --- a/ltp/fsx.c > > > > +++ b/ltp/fsx.c > > > > static struct option longopts[] = { > > > > {"replay-ops", required_argument, 0, 256}, > > > > {"record-ops", optional_argument, 0, 255}, > > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv) > > > > setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */ > > > > > > > > while ((ch = getopt_long(argc, argv, > > > > - "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > + "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > longopts, NULL)) != EOF) > > > > switch (ch) { > > > > case 'b': > > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv) > > > > case 'g': > > > > filldata = *optarg; > > > > break; > > > > + case 'h': > > > > +#ifndef MADV_COLLAPSE > > > > + fprintf(stderr, "MADV_COLLAPSE not supported. " > > > > + "Can't support -h\n"); > > > > + exit(86); > > > > +#endif > > > > + hugepages = 1; > > > > + break; > > > > > > ... > > > if we could change this part to: > > > > > > #ifdef MADV_COLLAPSE > > > hugepages = 1; > > > #endif > > > break; > > > > > > to avoid the test failures on old systems. > > > > > > Or any better ideas from you :) > > > > Hi Zorro, > > > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What > > do you think about skipping generic/758 and generic/759 if the kernel > > version is older than 6.1? That to me seems more preferable than the > > paste above, as the paste above would execute the test as if it did > > test hugepages when in reality it didn't, which would be misleading. > > Now that I've gotten to try this out -- > > There's a couple of things going on here. The first is that generic/759 > and 760 need to check if invoking fsx -h causes it to spit out the > "MADV_COLLAPSE not supported" error and _notrun the test. > > The second thing is that userspace programs can ensure the existence of > MADV_COLLAPSE in multiple ways. The first way is through sys/mman.h, > which requires that the underlying C library headers are new enough to > include a definition. glibc 2.37 is new enough, but even things like > Debian 12 and RHEL 9 aren't new enough to have that. Other C libraries > might not follow glibc's practice of wrapping and/or redefining symbols > in a way that you hope is the same as... > > The second way is through linux/mman.h, which comes from the kernel > headers package; and the third way is for the program to define it > itself if nobody else does. > > So I think the easiest way to fix the fsx.c build is to include > linux/mman.h in addition to sys/mman.h. Sorry I didn't notice these Thanks for your input. Do we still need sys/mman.h if linux/mman.h is added? For generic/758 and 759, does it suffice to gate this on whether the kernel version if 6.1+ and _notrun if not? My understanding is that any kernel version 6.1 or newer will have MADV_COLLAPSE in its kernel headers package and support the feature. Thanks, Joanne > details when I reviewed your patch; I'm a little attention constrained > ATM trying to get a large pile of bugfixes and redesigns reviewed so > for-next can finally move forward again. > > --D > > > > > Thanks, > > Joanne > > > > > > > > Thanks, > > > Zorro > > > > > > > case 'i': > > > > integrity = 1; > > > > logdev = strdup(optarg); > > > > @@ -3229,15 +3377,7 @@ main(int argc, char **argv) > > > > exit(95); > > > > } > > > > } > > > > - original_buf = (char *) malloc(maxfilelen); > > > > - for (i = 0; i < maxfilelen; i++) > > > > - original_buf[i] = random() % 256; > > > > - good_buf = (char *) malloc(maxfilelen + writebdy); > > > > - good_buf = round_ptr_up(good_buf, writebdy, 0); > > > > - memset(good_buf, '\0', maxfilelen); > > > > - temp_buf = (char *) malloc(maxoplen + readbdy); > > > > - temp_buf = round_ptr_up(temp_buf, readbdy, 0); > > > > - memset(temp_buf, '\0', maxoplen); > > > > + init_buffers(); > > > > if (lite) { /* zero entire existing file */ > > > > ssize_t written; > > > > > > > > -- > > > > 2.47.1 > > > > > > >
On Mon, Feb 03, 2025 at 11:23:20AM -0800, Joanne Koong wrote: > On Mon, Feb 3, 2025 at 10:59 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote: > > > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote: > > > > > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote: > > > > > Add support for reads/writes from buffers backed by hugepages. > > > > > This can be enabled through the '-h' flag. This flag should only be used > > > > > on systems where THP capabilities are enabled. > > > > > > > > > > This is motivated by a recent bug that was due to faulty handling of > > > > > userspace buffers backed by hugepages. This patch is a mitigation > > > > > against problems like this in the future. > > > > > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com> > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com> > > > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org> > > > > > --- > > > > > > > > Those two test cases fail on old system which doesn't support > > > > MADV_COLLAPSE: > > > > > > > > fsx -N 10000 -l 500000 -h > > > > -fsx -N 10000 -o 8192 -l 500000 -h > > > > -fsx -N 10000 -o 128000 -l 500000 -h > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > and > > > > > > > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > +mapped writes DISABLED > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > I'm wondering ... > > > > > > > > > ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++----- > > > > > 1 file changed, 153 insertions(+), 13 deletions(-) > > > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > > > > index 41933354..3be383c6 100644 > > > > > --- a/ltp/fsx.c > > > > > +++ b/ltp/fsx.c > > > > > static struct option longopts[] = { > > > > > {"replay-ops", required_argument, 0, 256}, > > > > > {"record-ops", optional_argument, 0, 255}, > > > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv) > > > > > setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */ > > > > > > > > > > while ((ch = getopt_long(argc, argv, > > > > > - "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > > + "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > > longopts, NULL)) != EOF) > > > > > switch (ch) { > > > > > case 'b': > > > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv) > > > > > case 'g': > > > > > filldata = *optarg; > > > > > break; > > > > > + case 'h': > > > > > +#ifndef MADV_COLLAPSE > > > > > + fprintf(stderr, "MADV_COLLAPSE not supported. " > > > > > + "Can't support -h\n"); > > > > > + exit(86); > > > > > +#endif > > > > > + hugepages = 1; > > > > > + break; > > > > > > > > ... > > > > if we could change this part to: > > > > > > > > #ifdef MADV_COLLAPSE > > > > hugepages = 1; > > > > #endif > > > > break; > > > > > > > > to avoid the test failures on old systems. > > > > > > > > Or any better ideas from you :) > > > > > > Hi Zorro, > > > > > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What > > > do you think about skipping generic/758 and generic/759 if the kernel > > > version is older than 6.1? That to me seems more preferable than the > > > paste above, as the paste above would execute the test as if it did > > > test hugepages when in reality it didn't, which would be misleading. > > > > Now that I've gotten to try this out -- > > > > There's a couple of things going on here. The first is that generic/759 > > and 760 need to check if invoking fsx -h causes it to spit out the > > "MADV_COLLAPSE not supported" error and _notrun the test. > > > > The second thing is that userspace programs can ensure the existence of > > MADV_COLLAPSE in multiple ways. The first way is through sys/mman.h, > > which requires that the underlying C library headers are new enough to > > include a definition. glibc 2.37 is new enough, but even things like > > Debian 12 and RHEL 9 aren't new enough to have that. Other C libraries > > might not follow glibc's practice of wrapping and/or redefining symbols > > in a way that you hope is the same as... > > > > The second way is through linux/mman.h, which comes from the kernel > > headers package; and the third way is for the program to define it > > itself if nobody else does. > > > > So I think the easiest way to fix the fsx.c build is to include > > linux/mman.h in addition to sys/mman.h. Sorry I didn't notice these > > Thanks for your input. Do we still need sys/mman.h if linux/mman.h is added? Yes, because glibc provides the mmap() function that wraps syscall(__NR_mmap, ...); > For generic/758 and 759, does it suffice to gate this on whether the > kernel version if 6.1+ and _notrun if not? My understanding is that > any kernel version 6.1 or newer will have MADV_COLLAPSE in its kernel > headers package and support the feature. No, because some (most?) vendors backport new features into existing kernels without revving the version number of that kernel. Maybe the following fixes things? --D generic/759,760: fix MADV_COLLAPSE detection and inclusion On systems with "old" C libraries such as glibc 2.36 in Debian 12, the MADV_COLLAPSE flag might not be defined in any of the header files pulled in by sys/mman.h, which means that the fsx binary might not get built with any of the MADV_COLLAPSE code. If the kernel supports THP, the test will fail with: > QA output created by 760 > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > +mapped writes DISABLED > +MADV_COLLAPSE not supported. Can't support -h Fix both tests to detect fsx binaries that don't support MADV_COLLAPSE, then fix fsx.c to include the mman.h from the kernel headers (aka linux/mman.h) so that we can actually test IOs to and from THPs if the kernel is newer than the rest of userspace. Cc: <fstests@vger.kernel.org> # v2025.02.02 Fixes: 627289232371e3 ("generic: add tests for read/writes from hugepages-backed buffers") Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> --- ltp/fsx.c | 1 + tests/generic/759 | 3 +++ tests/generic/760 | 3 +++ 3 files changed, 7 insertions(+) diff --git a/ltp/fsx.c b/ltp/fsx.c index 634c496ffe9317..cf9502a74c17a7 100644 --- a/ltp/fsx.c +++ b/ltp/fsx.c @@ -20,6 +20,7 @@ #include <strings.h> #include <sys/file.h> #include <sys/mman.h> +#include <linux/mman.h> #include <sys/uio.h> #include <stdbool.h> #ifdef HAVE_ERR_H diff --git a/tests/generic/759 b/tests/generic/759 index 6c74478aa8a0e0..3549c5ed6a9299 100755 --- a/tests/generic/759 +++ b/tests/generic/759 @@ -14,6 +14,9 @@ _begin_fstest rw auto quick _require_test _require_thp +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \ + _notrun "fsx binary does not support MADV_COLLAPSE" + run_fsx -N 10000 -l 500000 -h run_fsx -N 10000 -o 8192 -l 500000 -h run_fsx -N 10000 -o 128000 -l 500000 -h diff --git a/tests/generic/760 b/tests/generic/760 index c71a196222ad3b..2fbd28502ae678 100755 --- a/tests/generic/760 +++ b/tests/generic/760 @@ -15,6 +15,9 @@ _require_test _require_odirect _require_thp +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \ + _notrun "fsx binary does not support MADV_COLLAPSE" + psize=`$here/src/feature -s` bsize=`$here/src/min_dio_alignment $TEST_DIR $TEST_DEV`
On Mon, Feb 3, 2025 at 11:41 AM Darrick J. Wong <djwong@kernel.org> wrote: > > On Mon, Feb 03, 2025 at 11:23:20AM -0800, Joanne Koong wrote: > > On Mon, Feb 3, 2025 at 10:59 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote: > > > > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote: > > > > > > > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote: > > > > > > Add support for reads/writes from buffers backed by hugepages. > > > > > > This can be enabled through the '-h' flag. This flag should only be used > > > > > > on systems where THP capabilities are enabled. > > > > > > > > > > > > This is motivated by a recent bug that was due to faulty handling of > > > > > > userspace buffers backed by hugepages. This patch is a mitigation > > > > > > against problems like this in the future. > > > > > > > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com> > > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com> > > > > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org> > > > > > > --- > > > > > > > > > > Those two test cases fail on old system which doesn't support > > > > > MADV_COLLAPSE: > > > > > > > > > > fsx -N 10000 -l 500000 -h > > > > > -fsx -N 10000 -o 8192 -l 500000 -h > > > > > -fsx -N 10000 -o 128000 -l 500000 -h > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > > > and > > > > > > > > > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > +mapped writes DISABLED > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > > > I'm wondering ... > > > > > > > > > > > ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++----- > > > > > > 1 file changed, 153 insertions(+), 13 deletions(-) > > > > > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > > > > > index 41933354..3be383c6 100644 > > > > > > --- a/ltp/fsx.c > > > > > > +++ b/ltp/fsx.c > > > > > > static struct option longopts[] = { > > > > > > {"replay-ops", required_argument, 0, 256}, > > > > > > {"record-ops", optional_argument, 0, 255}, > > > > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv) > > > > > > setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */ > > > > > > > > > > > > while ((ch = getopt_long(argc, argv, > > > > > > - "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > > > + "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > > > longopts, NULL)) != EOF) > > > > > > switch (ch) { > > > > > > case 'b': > > > > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv) > > > > > > case 'g': > > > > > > filldata = *optarg; > > > > > > break; > > > > > > + case 'h': > > > > > > +#ifndef MADV_COLLAPSE > > > > > > + fprintf(stderr, "MADV_COLLAPSE not supported. " > > > > > > + "Can't support -h\n"); > > > > > > + exit(86); > > > > > > +#endif > > > > > > + hugepages = 1; > > > > > > + break; > > > > > > > > > > ... > > > > > if we could change this part to: > > > > > > > > > > #ifdef MADV_COLLAPSE > > > > > hugepages = 1; > > > > > #endif > > > > > break; > > > > > > > > > > to avoid the test failures on old systems. > > > > > > > > > > Or any better ideas from you :) > > > > > > > > Hi Zorro, > > > > > > > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What > > > > do you think about skipping generic/758 and generic/759 if the kernel > > > > version is older than 6.1? That to me seems more preferable than the > > > > paste above, as the paste above would execute the test as if it did > > > > test hugepages when in reality it didn't, which would be misleading. > > > > > > Now that I've gotten to try this out -- > > > > > > There's a couple of things going on here. The first is that generic/759 > > > and 760 need to check if invoking fsx -h causes it to spit out the > > > "MADV_COLLAPSE not supported" error and _notrun the test. > > > > > > The second thing is that userspace programs can ensure the existence of > > > MADV_COLLAPSE in multiple ways. The first way is through sys/mman.h, > > > which requires that the underlying C library headers are new enough to > > > include a definition. glibc 2.37 is new enough, but even things like > > > Debian 12 and RHEL 9 aren't new enough to have that. Other C libraries > > > might not follow glibc's practice of wrapping and/or redefining symbols > > > in a way that you hope is the same as... > > > > > > The second way is through linux/mman.h, which comes from the kernel > > > headers package; and the third way is for the program to define it > > > itself if nobody else does. > > > > > > So I think the easiest way to fix the fsx.c build is to include > > > linux/mman.h in addition to sys/mman.h. Sorry I didn't notice these > > > > Thanks for your input. Do we still need sys/mman.h if linux/mman.h is added? > > Yes, because glibc provides the mmap() function that wraps > syscall(__NR_mmap, ...); > > > For generic/758 and 759, does it suffice to gate this on whether the > > kernel version if 6.1+ and _notrun if not? My understanding is that > > any kernel version 6.1 or newer will have MADV_COLLAPSE in its kernel > > headers package and support the feature. > > No, because some (most?) vendors backport new features into existing > kernels without revving the version number of that kernel. Oh okay, I see. That makes sense, thanks for the explanation. > > Maybe the following fixes things? > > --D > > generic/759,760: fix MADV_COLLAPSE detection and inclusion > > On systems with "old" C libraries such as glibc 2.36 in Debian 12, the > MADV_COLLAPSE flag might not be defined in any of the header files > pulled in by sys/mman.h, which means that the fsx binary might not get > built with any of the MADV_COLLAPSE code. If the kernel supports THP, > the test will fail with: > > > QA output created by 760 > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > +mapped writes DISABLED > > +MADV_COLLAPSE not supported. Can't support -h > > Fix both tests to detect fsx binaries that don't support MADV_COLLAPSE, > then fix fsx.c to include the mman.h from the kernel headers (aka > linux/mman.h) so that we can actually test IOs to and from THPs if the > kernel is newer than the rest of userspace. > > Cc: <fstests@vger.kernel.org> # v2025.02.02 > Fixes: 627289232371e3 ("generic: add tests for read/writes from hugepages-backed buffers") > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > --- > ltp/fsx.c | 1 + > tests/generic/759 | 3 +++ > tests/generic/760 | 3 +++ > 3 files changed, 7 insertions(+) > > diff --git a/ltp/fsx.c b/ltp/fsx.c > index 634c496ffe9317..cf9502a74c17a7 100644 > --- a/ltp/fsx.c > +++ b/ltp/fsx.c > @@ -20,6 +20,7 @@ > #include <strings.h> > #include <sys/file.h> > #include <sys/mman.h> > +#include <linux/mman.h> > #include <sys/uio.h> > #include <stdbool.h> > #ifdef HAVE_ERR_H > diff --git a/tests/generic/759 b/tests/generic/759 > index 6c74478aa8a0e0..3549c5ed6a9299 100755 > --- a/tests/generic/759 > +++ b/tests/generic/759 > @@ -14,6 +14,9 @@ _begin_fstest rw auto quick > _require_test > _require_thp > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \ > + _notrun "fsx binary does not support MADV_COLLAPSE" > + > run_fsx -N 10000 -l 500000 -h > run_fsx -N 10000 -o 8192 -l 500000 -h > run_fsx -N 10000 -o 128000 -l 500000 -h > diff --git a/tests/generic/760 b/tests/generic/760 > index c71a196222ad3b..2fbd28502ae678 100755 > --- a/tests/generic/760 > +++ b/tests/generic/760 > @@ -15,6 +15,9 @@ _require_test > _require_odirect > _require_thp > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \ > + _notrun "fsx binary does not support MADV_COLLAPSE" > + I tried this out locally and it works for me: generic/759 8s ... [not run] fsx binary does not support MADV_COLLAPSE Ran: generic/759 Not run: generic/759 Passed all 1 tests SECTION -- fuse ========================= Ran: generic/759 Not run: generic/759 Passed all 1 tests Thanks, Joanne > psize=`$here/src/feature -s` > bsize=`$here/src/min_dio_alignment $TEST_DIR $TEST_DEV` >
On Mon, Feb 03, 2025 at 11:57:23AM -0800, Joanne Koong wrote: > On Mon, Feb 3, 2025 at 11:41 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > On Mon, Feb 03, 2025 at 11:23:20AM -0800, Joanne Koong wrote: > > > On Mon, Feb 3, 2025 at 10:59 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote: > > > > > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote: > > > > > > > > > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote: > > > > > > > Add support for reads/writes from buffers backed by hugepages. > > > > > > > This can be enabled through the '-h' flag. This flag should only be used > > > > > > > on systems where THP capabilities are enabled. > > > > > > > > > > > > > > This is motivated by a recent bug that was due to faulty handling of > > > > > > > userspace buffers backed by hugepages. This patch is a mitigation > > > > > > > against problems like this in the future. > > > > > > > > > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com> > > > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com> > > > > > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org> > > > > > > > --- > > > > > > > > > > > > Those two test cases fail on old system which doesn't support > > > > > > MADV_COLLAPSE: > > > > > > > > > > > > fsx -N 10000 -l 500000 -h > > > > > > -fsx -N 10000 -o 8192 -l 500000 -h > > > > > > -fsx -N 10000 -o 128000 -l 500000 -h > > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > > > > > and > > > > > > > > > > > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > +mapped writes DISABLED > > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > > > > > I'm wondering ... > > > > > > > > > > > > > ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++----- > > > > > > > 1 file changed, 153 insertions(+), 13 deletions(-) > > > > > > > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > > > > > > index 41933354..3be383c6 100644 > > > > > > > --- a/ltp/fsx.c > > > > > > > +++ b/ltp/fsx.c > > > > > > > static struct option longopts[] = { > > > > > > > {"replay-ops", required_argument, 0, 256}, > > > > > > > {"record-ops", optional_argument, 0, 255}, > > > > > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv) > > > > > > > setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */ > > > > > > > > > > > > > > while ((ch = getopt_long(argc, argv, > > > > > > > - "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > > > > + "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > > > > longopts, NULL)) != EOF) > > > > > > > switch (ch) { > > > > > > > case 'b': > > > > > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv) > > > > > > > case 'g': > > > > > > > filldata = *optarg; > > > > > > > break; > > > > > > > + case 'h': > > > > > > > +#ifndef MADV_COLLAPSE > > > > > > > + fprintf(stderr, "MADV_COLLAPSE not supported. " > > > > > > > + "Can't support -h\n"); > > > > > > > + exit(86); > > > > > > > +#endif > > > > > > > + hugepages = 1; > > > > > > > + break; > > > > > > > > > > > > ... > > > > > > if we could change this part to: > > > > > > > > > > > > #ifdef MADV_COLLAPSE > > > > > > hugepages = 1; > > > > > > #endif > > > > > > break; > > > > > > > > > > > > to avoid the test failures on old systems. > > > > > > > > > > > > Or any better ideas from you :) > > > > > > > > > > Hi Zorro, > > > > > > > > > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What > > > > > do you think about skipping generic/758 and generic/759 if the kernel > > > > > version is older than 6.1? That to me seems more preferable than the > > > > > paste above, as the paste above would execute the test as if it did > > > > > test hugepages when in reality it didn't, which would be misleading. > > > > > > > > Now that I've gotten to try this out -- > > > > > > > > There's a couple of things going on here. The first is that generic/759 > > > > and 760 need to check if invoking fsx -h causes it to spit out the > > > > "MADV_COLLAPSE not supported" error and _notrun the test. > > > > > > > > The second thing is that userspace programs can ensure the existence of > > > > MADV_COLLAPSE in multiple ways. The first way is through sys/mman.h, > > > > which requires that the underlying C library headers are new enough to > > > > include a definition. glibc 2.37 is new enough, but even things like > > > > Debian 12 and RHEL 9 aren't new enough to have that. Other C libraries > > > > might not follow glibc's practice of wrapping and/or redefining symbols > > > > in a way that you hope is the same as... > > > > > > > > The second way is through linux/mman.h, which comes from the kernel > > > > headers package; and the third way is for the program to define it > > > > itself if nobody else does. > > > > > > > > So I think the easiest way to fix the fsx.c build is to include > > > > linux/mman.h in addition to sys/mman.h. Sorry I didn't notice these > > > > > > Thanks for your input. Do we still need sys/mman.h if linux/mman.h is added? > > > > Yes, because glibc provides the mmap() function that wraps > > syscall(__NR_mmap, ...); > > > > > For generic/758 and 759, does it suffice to gate this on whether the > > > kernel version if 6.1+ and _notrun if not? My understanding is that > > > any kernel version 6.1 or newer will have MADV_COLLAPSE in its kernel > > > headers package and support the feature. > > > > No, because some (most?) vendors backport new features into existing > > kernels without revving the version number of that kernel. > > Oh okay, I see. That makes sense, thanks for the explanation. > > > > > Maybe the following fixes things? > > > > --D > > > > generic/759,760: fix MADV_COLLAPSE detection and inclusion > > > > On systems with "old" C libraries such as glibc 2.36 in Debian 12, the > > MADV_COLLAPSE flag might not be defined in any of the header files > > pulled in by sys/mman.h, which means that the fsx binary might not get > > built with any of the MADV_COLLAPSE code. If the kernel supports THP, > > the test will fail with: > > > > > QA output created by 760 > > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > +mapped writes DISABLED > > > +MADV_COLLAPSE not supported. Can't support -h > > > > Fix both tests to detect fsx binaries that don't support MADV_COLLAPSE, > > then fix fsx.c to include the mman.h from the kernel headers (aka > > linux/mman.h) so that we can actually test IOs to and from THPs if the > > kernel is newer than the rest of userspace. > > > > Cc: <fstests@vger.kernel.org> # v2025.02.02 > > Fixes: 627289232371e3 ("generic: add tests for read/writes from hugepages-backed buffers") > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > --- > > ltp/fsx.c | 1 + > > tests/generic/759 | 3 +++ > > tests/generic/760 | 3 +++ > > 3 files changed, 7 insertions(+) > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > index 634c496ffe9317..cf9502a74c17a7 100644 > > --- a/ltp/fsx.c > > +++ b/ltp/fsx.c > > @@ -20,6 +20,7 @@ > > #include <strings.h> > > #include <sys/file.h> > > #include <sys/mman.h> > > +#include <linux/mman.h> > > #include <sys/uio.h> > > #include <stdbool.h> > > #ifdef HAVE_ERR_H > > diff --git a/tests/generic/759 b/tests/generic/759 > > index 6c74478aa8a0e0..3549c5ed6a9299 100755 > > --- a/tests/generic/759 > > +++ b/tests/generic/759 > > @@ -14,6 +14,9 @@ _begin_fstest rw auto quick > > _require_test > > _require_thp > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \ > > + _notrun "fsx binary does not support MADV_COLLAPSE" > > + > > run_fsx -N 10000 -l 500000 -h > > run_fsx -N 10000 -o 8192 -l 500000 -h > > run_fsx -N 10000 -o 128000 -l 500000 -h > > diff --git a/tests/generic/760 b/tests/generic/760 > > index c71a196222ad3b..2fbd28502ae678 100755 > > --- a/tests/generic/760 > > +++ b/tests/generic/760 > > @@ -15,6 +15,9 @@ _require_test > > _require_odirect > > _require_thp > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \ > > + _notrun "fsx binary does not support MADV_COLLAPSE" > > + > > I tried this out locally and it works for me: > > generic/759 8s ... [not run] fsx binary does not support MADV_COLLAPSE > Ran: generic/759 > Not run: generic/759 > Passed all 1 tests > > SECTION -- fuse > ========================= > Ran: generic/759 > Not run: generic/759 > Passed all 1 tests Does the test actually run if you /do/ have kernel/libc headers that define MADV_COLLAPSE? And if so, does that count as a Tested-by? --D > > Thanks, > Joanne > > > psize=`$here/src/feature -s` > > bsize=`$here/src/min_dio_alignment $TEST_DIR $TEST_DEV` > > >
On Mon, Feb 3, 2025 at 12:01 PM Darrick J. Wong <djwong@kernel.org> wrote: > > On Mon, Feb 03, 2025 at 11:57:23AM -0800, Joanne Koong wrote: > > On Mon, Feb 3, 2025 at 11:41 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > On Mon, Feb 03, 2025 at 11:23:20AM -0800, Joanne Koong wrote: > > > > On Mon, Feb 3, 2025 at 10:59 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote: > > > > > > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote: > > > > > > > > > > > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote: > > > > > > > > Add support for reads/writes from buffers backed by hugepages. > > > > > > > > This can be enabled through the '-h' flag. This flag should only be used > > > > > > > > on systems where THP capabilities are enabled. > > > > > > > > > > > > > > > > This is motivated by a recent bug that was due to faulty handling of > > > > > > > > userspace buffers backed by hugepages. This patch is a mitigation > > > > > > > > against problems like this in the future. > > > > > > > > > > > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com> > > > > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com> > > > > > > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org> > > > > > > > > --- > > > > > > > > > > > > > > Those two test cases fail on old system which doesn't support > > > > > > > MADV_COLLAPSE: > > > > > > > > > > > > > > fsx -N 10000 -l 500000 -h > > > > > > > -fsx -N 10000 -o 8192 -l 500000 -h > > > > > > > -fsx -N 10000 -o 128000 -l 500000 -h > > > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > > > > > > > and > > > > > > > > > > > > > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > +mapped writes DISABLED > > > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > > > > > > > I'm wondering ... > > > > > > > > > > > > > > > ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++----- > > > > > > > > 1 file changed, 153 insertions(+), 13 deletions(-) > > > > > > > > > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > > > > > > > index 41933354..3be383c6 100644 > > > > > > > > --- a/ltp/fsx.c > > > > > > > > +++ b/ltp/fsx.c > > > > > > > > static struct option longopts[] = { > > > > > > > > {"replay-ops", required_argument, 0, 256}, > > > > > > > > {"record-ops", optional_argument, 0, 255}, > > > > > > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv) > > > > > > > > setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */ > > > > > > > > > > > > > > > > while ((ch = getopt_long(argc, argv, > > > > > > > > - "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > > > > > + "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > > > > > longopts, NULL)) != EOF) > > > > > > > > switch (ch) { > > > > > > > > case 'b': > > > > > > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv) > > > > > > > > case 'g': > > > > > > > > filldata = *optarg; > > > > > > > > break; > > > > > > > > + case 'h': > > > > > > > > +#ifndef MADV_COLLAPSE > > > > > > > > + fprintf(stderr, "MADV_COLLAPSE not supported. " > > > > > > > > + "Can't support -h\n"); > > > > > > > > + exit(86); > > > > > > > > +#endif > > > > > > > > + hugepages = 1; > > > > > > > > + break; > > > > > > > > > > > > > > ... > > > > > > > if we could change this part to: > > > > > > > > > > > > > > #ifdef MADV_COLLAPSE > > > > > > > hugepages = 1; > > > > > > > #endif > > > > > > > break; > > > > > > > > > > > > > > to avoid the test failures on old systems. > > > > > > > > > > > > > > Or any better ideas from you :) > > > > > > > > > > > > Hi Zorro, > > > > > > > > > > > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What > > > > > > do you think about skipping generic/758 and generic/759 if the kernel > > > > > > version is older than 6.1? That to me seems more preferable than the > > > > > > paste above, as the paste above would execute the test as if it did > > > > > > test hugepages when in reality it didn't, which would be misleading. > > > > > > > > > > Now that I've gotten to try this out -- > > > > > > > > > > There's a couple of things going on here. The first is that generic/759 > > > > > and 760 need to check if invoking fsx -h causes it to spit out the > > > > > "MADV_COLLAPSE not supported" error and _notrun the test. > > > > > > > > > > The second thing is that userspace programs can ensure the existence of > > > > > MADV_COLLAPSE in multiple ways. The first way is through sys/mman.h, > > > > > which requires that the underlying C library headers are new enough to > > > > > include a definition. glibc 2.37 is new enough, but even things like > > > > > Debian 12 and RHEL 9 aren't new enough to have that. Other C libraries > > > > > might not follow glibc's practice of wrapping and/or redefining symbols > > > > > in a way that you hope is the same as... > > > > > > > > > > The second way is through linux/mman.h, which comes from the kernel > > > > > headers package; and the third way is for the program to define it > > > > > itself if nobody else does. > > > > > > > > > > So I think the easiest way to fix the fsx.c build is to include > > > > > linux/mman.h in addition to sys/mman.h. Sorry I didn't notice these > > > > > > > > Thanks for your input. Do we still need sys/mman.h if linux/mman.h is added? > > > > > > Yes, because glibc provides the mmap() function that wraps > > > syscall(__NR_mmap, ...); > > > > > > > For generic/758 and 759, does it suffice to gate this on whether the > > > > kernel version if 6.1+ and _notrun if not? My understanding is that > > > > any kernel version 6.1 or newer will have MADV_COLLAPSE in its kernel > > > > headers package and support the feature. > > > > > > No, because some (most?) vendors backport new features into existing > > > kernels without revving the version number of that kernel. > > > > Oh okay, I see. That makes sense, thanks for the explanation. > > > > > > > > Maybe the following fixes things? > > > > > > --D > > > > > > generic/759,760: fix MADV_COLLAPSE detection and inclusion > > > > > > On systems with "old" C libraries such as glibc 2.36 in Debian 12, the > > > MADV_COLLAPSE flag might not be defined in any of the header files > > > pulled in by sys/mman.h, which means that the fsx binary might not get > > > built with any of the MADV_COLLAPSE code. If the kernel supports THP, > > > the test will fail with: > > > > > > > QA output created by 760 > > > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > +mapped writes DISABLED > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > Fix both tests to detect fsx binaries that don't support MADV_COLLAPSE, > > > then fix fsx.c to include the mman.h from the kernel headers (aka > > > linux/mman.h) so that we can actually test IOs to and from THPs if the > > > kernel is newer than the rest of userspace. > > > > > > Cc: <fstests@vger.kernel.org> # v2025.02.02 > > > Fixes: 627289232371e3 ("generic: add tests for read/writes from hugepages-backed buffers") > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > > --- > > > ltp/fsx.c | 1 + > > > tests/generic/759 | 3 +++ > > > tests/generic/760 | 3 +++ > > > 3 files changed, 7 insertions(+) > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > > index 634c496ffe9317..cf9502a74c17a7 100644 > > > --- a/ltp/fsx.c > > > +++ b/ltp/fsx.c > > > @@ -20,6 +20,7 @@ > > > #include <strings.h> > > > #include <sys/file.h> > > > #include <sys/mman.h> > > > +#include <linux/mman.h> > > > #include <sys/uio.h> > > > #include <stdbool.h> > > > #ifdef HAVE_ERR_H > > > diff --git a/tests/generic/759 b/tests/generic/759 > > > index 6c74478aa8a0e0..3549c5ed6a9299 100755 > > > --- a/tests/generic/759 > > > +++ b/tests/generic/759 > > > @@ -14,6 +14,9 @@ _begin_fstest rw auto quick > > > _require_test > > > _require_thp > > > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \ > > > + _notrun "fsx binary does not support MADV_COLLAPSE" > > > + > > > run_fsx -N 10000 -l 500000 -h > > > run_fsx -N 10000 -o 8192 -l 500000 -h > > > run_fsx -N 10000 -o 128000 -l 500000 -h > > > diff --git a/tests/generic/760 b/tests/generic/760 > > > index c71a196222ad3b..2fbd28502ae678 100755 > > > --- a/tests/generic/760 > > > +++ b/tests/generic/760 > > > @@ -15,6 +15,9 @@ _require_test > > > _require_odirect > > > _require_thp > > > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \ > > > + _notrun "fsx binary does not support MADV_COLLAPSE" > > > + > > > > I tried this out locally and it works for me: > > > > generic/759 8s ... [not run] fsx binary does not support MADV_COLLAPSE > > Ran: generic/759 > > Not run: generic/759 > > Passed all 1 tests > > > > SECTION -- fuse > > ========================= > > Ran: generic/759 > > Not run: generic/759 > > Passed all 1 tests > > Does the test actually run if you /do/ have kernel/libc headers that > define MADV_COLLAPSE? And if so, does that count as a Tested-by? > I'm not sure if I fully understand the subtext of your question but yes, the test runs on my system when MADV_COLLAPSE is defined in the kernel/libc headers. For sanity checking the inverse case, (eg MADV_COLLAPSE not defined), I tried this out by modifying the ifdef/ifndef checks in fsx to MADV_COLLAPSE_ to see if the '$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported'' line catches that. > --D > > > > > Thanks, > > Joanne > > > > > psize=`$here/src/feature -s` > > > bsize=`$here/src/min_dio_alignment $TEST_DIR $TEST_DEV` > > > > >
On Mon, Feb 03, 2025 at 01:40:39PM -0800, Joanne Koong wrote: > On Mon, Feb 3, 2025 at 12:01 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > On Mon, Feb 03, 2025 at 11:57:23AM -0800, Joanne Koong wrote: > > > On Mon, Feb 3, 2025 at 11:41 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > On Mon, Feb 03, 2025 at 11:23:20AM -0800, Joanne Koong wrote: > > > > > On Mon, Feb 3, 2025 at 10:59 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > > > On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote: > > > > > > > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote: > > > > > > > > > > > > > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote: > > > > > > > > > Add support for reads/writes from buffers backed by hugepages. > > > > > > > > > This can be enabled through the '-h' flag. This flag should only be used > > > > > > > > > on systems where THP capabilities are enabled. > > > > > > > > > > > > > > > > > > This is motivated by a recent bug that was due to faulty handling of > > > > > > > > > userspace buffers backed by hugepages. This patch is a mitigation > > > > > > > > > against problems like this in the future. > > > > > > > > > > > > > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com> > > > > > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com> > > > > > > > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org> > > > > > > > > > --- > > > > > > > > > > > > > > > > Those two test cases fail on old system which doesn't support > > > > > > > > MADV_COLLAPSE: > > > > > > > > > > > > > > > > fsx -N 10000 -l 500000 -h > > > > > > > > -fsx -N 10000 -o 8192 -l 500000 -h > > > > > > > > -fsx -N 10000 -o 128000 -l 500000 -h > > > > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > > > > > > > > > and > > > > > > > > > > > > > > > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > +mapped writes DISABLED > > > > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > > > > > > > > > I'm wondering ... > > > > > > > > > > > > > > > > > ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++----- > > > > > > > > > 1 file changed, 153 insertions(+), 13 deletions(-) > > > > > > > > > > > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > > > > > > > > index 41933354..3be383c6 100644 > > > > > > > > > --- a/ltp/fsx.c > > > > > > > > > +++ b/ltp/fsx.c > > > > > > > > > static struct option longopts[] = { > > > > > > > > > {"replay-ops", required_argument, 0, 256}, > > > > > > > > > {"record-ops", optional_argument, 0, 255}, > > > > > > > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv) > > > > > > > > > setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */ > > > > > > > > > > > > > > > > > > while ((ch = getopt_long(argc, argv, > > > > > > > > > - "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > > > > > > + "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > > > > > > longopts, NULL)) != EOF) > > > > > > > > > switch (ch) { > > > > > > > > > case 'b': > > > > > > > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv) > > > > > > > > > case 'g': > > > > > > > > > filldata = *optarg; > > > > > > > > > break; > > > > > > > > > + case 'h': > > > > > > > > > +#ifndef MADV_COLLAPSE > > > > > > > > > + fprintf(stderr, "MADV_COLLAPSE not supported. " > > > > > > > > > + "Can't support -h\n"); > > > > > > > > > + exit(86); > > > > > > > > > +#endif > > > > > > > > > + hugepages = 1; > > > > > > > > > + break; > > > > > > > > > > > > > > > > ... > > > > > > > > if we could change this part to: > > > > > > > > > > > > > > > > #ifdef MADV_COLLAPSE > > > > > > > > hugepages = 1; > > > > > > > > #endif > > > > > > > > break; > > > > > > > > > > > > > > > > to avoid the test failures on old systems. > > > > > > > > > > > > > > > > Or any better ideas from you :) > > > > > > > > > > > > > > Hi Zorro, > > > > > > > > > > > > > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What > > > > > > > do you think about skipping generic/758 and generic/759 if the kernel > > > > > > > version is older than 6.1? That to me seems more preferable than the > > > > > > > paste above, as the paste above would execute the test as if it did > > > > > > > test hugepages when in reality it didn't, which would be misleading. > > > > > > > > > > > > Now that I've gotten to try this out -- > > > > > > > > > > > > There's a couple of things going on here. The first is that generic/759 > > > > > > and 760 need to check if invoking fsx -h causes it to spit out the > > > > > > "MADV_COLLAPSE not supported" error and _notrun the test. > > > > > > > > > > > > The second thing is that userspace programs can ensure the existence of > > > > > > MADV_COLLAPSE in multiple ways. The first way is through sys/mman.h, > > > > > > which requires that the underlying C library headers are new enough to > > > > > > include a definition. glibc 2.37 is new enough, but even things like > > > > > > Debian 12 and RHEL 9 aren't new enough to have that. Other C libraries > > > > > > might not follow glibc's practice of wrapping and/or redefining symbols > > > > > > in a way that you hope is the same as... > > > > > > > > > > > > The second way is through linux/mman.h, which comes from the kernel > > > > > > headers package; and the third way is for the program to define it > > > > > > itself if nobody else does. > > > > > > > > > > > > So I think the easiest way to fix the fsx.c build is to include > > > > > > linux/mman.h in addition to sys/mman.h. Sorry I didn't notice these > > > > > > > > > > Thanks for your input. Do we still need sys/mman.h if linux/mman.h is added? > > > > > > > > Yes, because glibc provides the mmap() function that wraps > > > > syscall(__NR_mmap, ...); > > > > > > > > > For generic/758 and 759, does it suffice to gate this on whether the > > > > > kernel version if 6.1+ and _notrun if not? My understanding is that > > > > > any kernel version 6.1 or newer will have MADV_COLLAPSE in its kernel > > > > > headers package and support the feature. > > > > > > > > No, because some (most?) vendors backport new features into existing > > > > kernels without revving the version number of that kernel. > > > > > > Oh okay, I see. That makes sense, thanks for the explanation. > > > > > > > > > > > Maybe the following fixes things? > > > > > > > > --D > > > > > > > > generic/759,760: fix MADV_COLLAPSE detection and inclusion > > > > > > > > On systems with "old" C libraries such as glibc 2.36 in Debian 12, the > > > > MADV_COLLAPSE flag might not be defined in any of the header files > > > > pulled in by sys/mman.h, which means that the fsx binary might not get > > > > built with any of the MADV_COLLAPSE code. If the kernel supports THP, > > > > the test will fail with: > > > > > > > > > QA output created by 760 > > > > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > +mapped writes DISABLED > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > Fix both tests to detect fsx binaries that don't support MADV_COLLAPSE, > > > > then fix fsx.c to include the mman.h from the kernel headers (aka > > > > linux/mman.h) so that we can actually test IOs to and from THPs if the > > > > kernel is newer than the rest of userspace. > > > > > > > > Cc: <fstests@vger.kernel.org> # v2025.02.02 > > > > Fixes: 627289232371e3 ("generic: add tests for read/writes from hugepages-backed buffers") > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > > > --- > > > > ltp/fsx.c | 1 + > > > > tests/generic/759 | 3 +++ > > > > tests/generic/760 | 3 +++ > > > > 3 files changed, 7 insertions(+) > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > > > index 634c496ffe9317..cf9502a74c17a7 100644 > > > > --- a/ltp/fsx.c > > > > +++ b/ltp/fsx.c > > > > @@ -20,6 +20,7 @@ > > > > #include <strings.h> > > > > #include <sys/file.h> > > > > #include <sys/mman.h> > > > > +#include <linux/mman.h> > > > > #include <sys/uio.h> > > > > #include <stdbool.h> > > > > #ifdef HAVE_ERR_H > > > > diff --git a/tests/generic/759 b/tests/generic/759 > > > > index 6c74478aa8a0e0..3549c5ed6a9299 100755 > > > > --- a/tests/generic/759 > > > > +++ b/tests/generic/759 > > > > @@ -14,6 +14,9 @@ _begin_fstest rw auto quick > > > > _require_test > > > > _require_thp > > > > > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \ > > > > + _notrun "fsx binary does not support MADV_COLLAPSE" > > > > + > > > > run_fsx -N 10000 -l 500000 -h > > > > run_fsx -N 10000 -o 8192 -l 500000 -h > > > > run_fsx -N 10000 -o 128000 -l 500000 -h > > > > diff --git a/tests/generic/760 b/tests/generic/760 > > > > index c71a196222ad3b..2fbd28502ae678 100755 > > > > --- a/tests/generic/760 > > > > +++ b/tests/generic/760 > > > > @@ -15,6 +15,9 @@ _require_test > > > > _require_odirect > > > > _require_thp > > > > > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \ > > > > + _notrun "fsx binary does not support MADV_COLLAPSE" > > > > + > > > > > > I tried this out locally and it works for me: > > > > > > generic/759 8s ... [not run] fsx binary does not support MADV_COLLAPSE > > > Ran: generic/759 > > > Not run: generic/759 > > > Passed all 1 tests > > > > > > SECTION -- fuse > > > ========================= > > > Ran: generic/759 > > > Not run: generic/759 > > > Passed all 1 tests > > > > Does the test actually run if you /do/ have kernel/libc headers that > > define MADV_COLLAPSE? And if so, does that count as a Tested-by? > > > > I'm not sure if I fully understand the subtext of your question but > yes, the test runs on my system when MADV_COLLAPSE is defined in the > kernel/libc headers. Yep, you understood me correctly. :) > For sanity checking the inverse case, (eg MADV_COLLAPSE not defined), > I tried this out by modifying the ifdef/ifndef checks in fsx to > MADV_COLLAPSE_ to see if the '$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | > grep -q 'MADV_COLLAPSE not supported'' line catches that. <nod> Sounds good then; I'll add this to my test clod and run it overnight. --D > > > --D > > > > > > > > Thanks, > > > Joanne > > > > > > > psize=`$here/src/feature -s` > > > > bsize=`$here/src/min_dio_alignment $TEST_DIR $TEST_DEV` > > > > > > >
On Mon, Feb 03, 2025 at 01:54:32PM -0800, Darrick J. Wong wrote: > On Mon, Feb 03, 2025 at 01:40:39PM -0800, Joanne Koong wrote: > > On Mon, Feb 3, 2025 at 12:01 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > On Mon, Feb 03, 2025 at 11:57:23AM -0800, Joanne Koong wrote: > > > > On Mon, Feb 3, 2025 at 11:41 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > On Mon, Feb 03, 2025 at 11:23:20AM -0800, Joanne Koong wrote: > > > > > > On Mon, Feb 3, 2025 at 10:59 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > > > > > On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote: > > > > > > > > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote: > > > > > > > > > > > > > > > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote: > > > > > > > > > > Add support for reads/writes from buffers backed by hugepages. > > > > > > > > > > This can be enabled through the '-h' flag. This flag should only be used > > > > > > > > > > on systems where THP capabilities are enabled. > > > > > > > > > > > > > > > > > > > > This is motivated by a recent bug that was due to faulty handling of > > > > > > > > > > userspace buffers backed by hugepages. This patch is a mitigation > > > > > > > > > > against problems like this in the future. > > > > > > > > > > > > > > > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com> > > > > > > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com> > > > > > > > > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org> > > > > > > > > > > --- > > > > > > > > > > > > > > > > > > Those two test cases fail on old system which doesn't support > > > > > > > > > MADV_COLLAPSE: > > > > > > > > > > > > > > > > > > fsx -N 10000 -l 500000 -h > > > > > > > > > -fsx -N 10000 -o 8192 -l 500000 -h > > > > > > > > > -fsx -N 10000 -o 128000 -l 500000 -h > > > > > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > > > > > > > > > > > and > > > > > > > > > > > > > > > > > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > > +mapped writes DISABLED > > > > > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > > > > > > > > > > > I'm wondering ... > > > > > > > > > > > > > > > > > > > ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++----- > > > > > > > > > > 1 file changed, 153 insertions(+), 13 deletions(-) > > > > > > > > > > > > > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > > > > > > > > > index 41933354..3be383c6 100644 > > > > > > > > > > --- a/ltp/fsx.c > > > > > > > > > > +++ b/ltp/fsx.c > > > > > > > > > > static struct option longopts[] = { > > > > > > > > > > {"replay-ops", required_argument, 0, 256}, > > > > > > > > > > {"record-ops", optional_argument, 0, 255}, > > > > > > > > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv) > > > > > > > > > > setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */ > > > > > > > > > > > > > > > > > > > > while ((ch = getopt_long(argc, argv, > > > > > > > > > > - "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > > > > > > > + "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > > > > > > > longopts, NULL)) != EOF) > > > > > > > > > > switch (ch) { > > > > > > > > > > case 'b': > > > > > > > > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv) > > > > > > > > > > case 'g': > > > > > > > > > > filldata = *optarg; > > > > > > > > > > break; > > > > > > > > > > + case 'h': > > > > > > > > > > +#ifndef MADV_COLLAPSE > > > > > > > > > > + fprintf(stderr, "MADV_COLLAPSE not supported. " > > > > > > > > > > + "Can't support -h\n"); > > > > > > > > > > + exit(86); > > > > > > > > > > +#endif > > > > > > > > > > + hugepages = 1; > > > > > > > > > > + break; > > > > > > > > > > > > > > > > > > ... > > > > > > > > > if we could change this part to: > > > > > > > > > > > > > > > > > > #ifdef MADV_COLLAPSE > > > > > > > > > hugepages = 1; > > > > > > > > > #endif > > > > > > > > > break; > > > > > > > > > > > > > > > > > > to avoid the test failures on old systems. > > > > > > > > > > > > > > > > > > Or any better ideas from you :) > > > > > > > > > > > > > > > > Hi Zorro, > > > > > > > > > > > > > > > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What > > > > > > > > do you think about skipping generic/758 and generic/759 if the kernel > > > > > > > > version is older than 6.1? That to me seems more preferable than the > > > > > > > > paste above, as the paste above would execute the test as if it did > > > > > > > > test hugepages when in reality it didn't, which would be misleading. > > > > > > > > > > > > > > Now that I've gotten to try this out -- > > > > > > > > > > > > > > There's a couple of things going on here. The first is that generic/759 > > > > > > > and 760 need to check if invoking fsx -h causes it to spit out the > > > > > > > "MADV_COLLAPSE not supported" error and _notrun the test. > > > > > > > > > > > > > > The second thing is that userspace programs can ensure the existence of > > > > > > > MADV_COLLAPSE in multiple ways. The first way is through sys/mman.h, > > > > > > > which requires that the underlying C library headers are new enough to > > > > > > > include a definition. glibc 2.37 is new enough, but even things like > > > > > > > Debian 12 and RHEL 9 aren't new enough to have that. Other C libraries > > > > > > > might not follow glibc's practice of wrapping and/or redefining symbols > > > > > > > in a way that you hope is the same as... > > > > > > > > > > > > > > The second way is through linux/mman.h, which comes from the kernel > > > > > > > headers package; and the third way is for the program to define it > > > > > > > itself if nobody else does. > > > > > > > > > > > > > > So I think the easiest way to fix the fsx.c build is to include > > > > > > > linux/mman.h in addition to sys/mman.h. Sorry I didn't notice these > > > > > > > > > > > > Thanks for your input. Do we still need sys/mman.h if linux/mman.h is added? > > > > > > > > > > Yes, because glibc provides the mmap() function that wraps > > > > > syscall(__NR_mmap, ...); > > > > > > > > > > > For generic/758 and 759, does it suffice to gate this on whether the > > > > > > kernel version if 6.1+ and _notrun if not? My understanding is that > > > > > > any kernel version 6.1 or newer will have MADV_COLLAPSE in its kernel > > > > > > headers package and support the feature. > > > > > > > > > > No, because some (most?) vendors backport new features into existing > > > > > kernels without revving the version number of that kernel. > > > > > > > > Oh okay, I see. That makes sense, thanks for the explanation. > > > > > > > > > > > > > > Maybe the following fixes things? > > > > > > > > > > --D > > > > > > > > > > generic/759,760: fix MADV_COLLAPSE detection and inclusion > > > > > > > > > > On systems with "old" C libraries such as glibc 2.36 in Debian 12, the > > > > > MADV_COLLAPSE flag might not be defined in any of the header files > > > > > pulled in by sys/mman.h, which means that the fsx binary might not get > > > > > built with any of the MADV_COLLAPSE code. If the kernel supports THP, > > > > > the test will fail with: > > > > > > > > > > > QA output created by 760 > > > > > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > +mapped writes DISABLED > > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > > > Fix both tests to detect fsx binaries that don't support MADV_COLLAPSE, > > > > > then fix fsx.c to include the mman.h from the kernel headers (aka > > > > > linux/mman.h) so that we can actually test IOs to and from THPs if the > > > > > kernel is newer than the rest of userspace. > > > > > > > > > > Cc: <fstests@vger.kernel.org> # v2025.02.02 > > > > > Fixes: 627289232371e3 ("generic: add tests for read/writes from hugepages-backed buffers") > > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > > > > --- > > > > > ltp/fsx.c | 1 + > > > > > tests/generic/759 | 3 +++ > > > > > tests/generic/760 | 3 +++ > > > > > 3 files changed, 7 insertions(+) > > > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > > > > index 634c496ffe9317..cf9502a74c17a7 100644 > > > > > --- a/ltp/fsx.c > > > > > +++ b/ltp/fsx.c > > > > > @@ -20,6 +20,7 @@ > > > > > #include <strings.h> > > > > > #include <sys/file.h> > > > > > #include <sys/mman.h> > > > > > +#include <linux/mman.h> > > > > > #include <sys/uio.h> > > > > > #include <stdbool.h> > > > > > #ifdef HAVE_ERR_H > > > > > diff --git a/tests/generic/759 b/tests/generic/759 > > > > > index 6c74478aa8a0e0..3549c5ed6a9299 100755 > > > > > --- a/tests/generic/759 > > > > > +++ b/tests/generic/759 > > > > > @@ -14,6 +14,9 @@ _begin_fstest rw auto quick > > > > > _require_test > > > > > _require_thp > > > > > > > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \ > > > > > + _notrun "fsx binary does not support MADV_COLLAPSE" > > > > > + > > > > > run_fsx -N 10000 -l 500000 -h > > > > > run_fsx -N 10000 -o 8192 -l 500000 -h > > > > > run_fsx -N 10000 -o 128000 -l 500000 -h > > > > > diff --git a/tests/generic/760 b/tests/generic/760 > > > > > index c71a196222ad3b..2fbd28502ae678 100755 > > > > > --- a/tests/generic/760 > > > > > +++ b/tests/generic/760 > > > > > @@ -15,6 +15,9 @@ _require_test > > > > > _require_odirect > > > > > _require_thp > > > > > > > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \ > > > > > + _notrun "fsx binary does not support MADV_COLLAPSE" > > > > > + > > > > > > > > I tried this out locally and it works for me: > > > > > > > > generic/759 8s ... [not run] fsx binary does not support MADV_COLLAPSE > > > > Ran: generic/759 > > > > Not run: generic/759 > > > > Passed all 1 tests > > > > > > > > SECTION -- fuse > > > > ========================= > > > > Ran: generic/759 > > > > Not run: generic/759 > > > > Passed all 1 tests > > > > > > Does the test actually run if you /do/ have kernel/libc headers that > > > define MADV_COLLAPSE? And if so, does that count as a Tested-by? > > > > > > > I'm not sure if I fully understand the subtext of your question but > > yes, the test runs on my system when MADV_COLLAPSE is defined in the > > kernel/libc headers. > > Yep, you understood me correctly. :) > > > For sanity checking the inverse case, (eg MADV_COLLAPSE not defined), > > I tried this out by modifying the ifdef/ifndef checks in fsx to > > MADV_COLLAPSE_ to see if the '$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | > > grep -q 'MADV_COLLAPSE not supported'' line catches that. > > <nod> Sounds good then; I'll add this to my test clod and run it > overnight. The arm64 vms with 64k base pages spat out this: --- /run/fstests/bin/tests/generic/759.out 2025-02-02 08:36:28.007248055 -0800 +++ /var/tmp/fstests/generic/759.out.bad 2025-02-03 16:51:34.862964640 -0800 @@ -1,4 +1,5 @@ QA output created by 759 fsx -N 10000 -l 500000 -h -fsx -N 10000 -o 8192 -l 500000 -h -fsx -N 10000 -o 128000 -l 500000 -h +Seed set to 1 +madvise collapse for buf: Cannot allocate memory +init_hugepages_buf failed for good_buf: Cannot allocate memory Note that it was trying to create a 512M page(!) on a VM with 8G of memory. Normally one doesn't use large base page size in low memory environments, but this /is/ fstestsland. 8-) From commit 7d8faaf155454f ("mm/madvise: introduce MADV_COLLAPSE sync hugepage collapse") : ENOMEM Memory allocation failed or VMA not found EBUSY Memcg charging failed EAGAIN Required resource temporarily unavailable. Try again might succeed. EINVAL Other error: No PMD found, subpage doesn't have Present bit set, "Special" page no backed by struct page, VMA incorrectly sized, address not page-aligned, ... It sounds like ENOMEM/EBUSY/EINVAL should result in _notrun "Could not generate hugepage" ? What are your thoughts? --D > --D > > > > > > --D > > > > > > > > > > > Thanks, > > > > Joanne > > > > > > > > > psize=`$here/src/feature -s` > > > > > bsize=`$here/src/min_dio_alignment $TEST_DIR $TEST_DEV` > > > > > > > > > >
On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote: > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote: > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote: > > > Add support for reads/writes from buffers backed by hugepages. > > > This can be enabled through the '-h' flag. This flag should only be used > > > on systems where THP capabilities are enabled. > > > > > > This is motivated by a recent bug that was due to faulty handling of > > > userspace buffers backed by hugepages. This patch is a mitigation > > > against problems like this in the future. > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com> > > > Reviewed-by: Brian Foster <bfoster@redhat.com> > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org> > > > --- > > > > Those two test cases fail on old system which doesn't support > > MADV_COLLAPSE: > > > > fsx -N 10000 -l 500000 -h > > -fsx -N 10000 -o 8192 -l 500000 -h > > -fsx -N 10000 -o 128000 -l 500000 -h > > +MADV_COLLAPSE not supported. Can't support -h > > > > and > > > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > +mapped writes DISABLED > > +MADV_COLLAPSE not supported. Can't support -h > > > > I'm wondering ... > > > > > ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++----- > > > 1 file changed, 153 insertions(+), 13 deletions(-) > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > > index 41933354..3be383c6 100644 > > > --- a/ltp/fsx.c > > > +++ b/ltp/fsx.c > > > @@ -190,6 +190,16 @@ int o_direct; /* -Z */ > > > int aio = 0; > > > int uring = 0; > > > int mark_nr = 0; > > > +int hugepages = 0; /* -h flag */ > > > + > > > +/* Stores info needed to periodically collapse hugepages */ > > > +struct hugepages_collapse_info { > > > + void *orig_good_buf; > > > + long good_buf_size; > > > + void *orig_temp_buf; > > > + long temp_buf_size; > > > +}; > > > +struct hugepages_collapse_info hugepages_info; > > > > > > int page_size; > > > int page_mask; > > > @@ -2471,7 +2481,7 @@ void > > > usage(void) > > > { > > > fprintf(stdout, "usage: %s", > > > - "fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\ > > > + "fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\ > > > [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\ > > > [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\ > > > [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\ > > > @@ -2483,8 +2493,11 @@ usage(void) > > > -d: debug output for all operations\n\ > > > -e: pollute post-eof on size changes (default 0)\n\ > > > -f: flush and invalidate cache after I/O\n\ > > > - -g X: write character X instead of random generated data\n\ > > > - -i logdev: do integrity testing, logdev is the dm log writes device\n\ > > > + -g X: write character X instead of random generated data\n" > > > +#ifdef MADV_COLLAPSE > > > +" -h hugepages: use buffers backed by hugepages for reads/writes\n" > > > +#endif > > > +" -i logdev: do integrity testing, logdev is the dm log writes device\n\ > > > -j logid: prefix debug log messsages with this id\n\ > > > -k: do not truncate existing file and use its size as upper bound on file size\n\ > > > -l flen: the upper bound on file size (default 262144)\n\ > > > @@ -2833,11 +2846,41 @@ __test_fallocate(int mode, const char *mode_str) > > > #endif > > > } > > > > > > +/* > > > + * Reclaim may break up hugepages, so do a best-effort collapse every once in > > > + * a while. > > > + */ > > > +static void > > > +collapse_hugepages(void) > > > +{ > > > +#ifdef MADV_COLLAPSE > > > + int ret; > > > + > > > + /* re-collapse every 16k fsxops after we start */ > > > + if (!numops || (numops & ((1U << 14) - 1))) > > > + return; > > > + > > > + ret = madvise(hugepages_info.orig_good_buf, > > > + hugepages_info.good_buf_size, MADV_COLLAPSE); > > > + if (ret) > > > + prt("collapsing hugepages for good_buf failed (numops=%llu): %s\n", > > > + numops, strerror(errno)); > > > + ret = madvise(hugepages_info.orig_temp_buf, > > > + hugepages_info.temp_buf_size, MADV_COLLAPSE); > > > + if (ret) > > > + prt("collapsing hugepages for temp_buf failed (numops=%llu): %s\n", > > > + numops, strerror(errno)); > > > +#endif > > > +} > > > + > > > bool > > > keep_running(void) > > > { > > > int ret; > > > > > > + if (hugepages) > > > + collapse_hugepages(); > > > + > > > if (deadline.tv_nsec) { > > > struct timespec now; > > > > > > @@ -2856,6 +2899,103 @@ keep_running(void) > > > return numops-- != 0; > > > } > > > > > > +static long > > > +get_hugepage_size(void) > > > +{ > > > + const char str[] = "Hugepagesize:"; > > > + size_t str_len = sizeof(str) - 1; > > > + unsigned int hugepage_size = 0; > > > + char buffer[64]; > > > + FILE *file; > > > + > > > + file = fopen("/proc/meminfo", "r"); > > > + if (!file) { > > > + prterr("get_hugepage_size: fopen /proc/meminfo"); > > > + return -1; > > > + } > > > + while (fgets(buffer, sizeof(buffer), file)) { > > > + if (strncmp(buffer, str, str_len) == 0) { > > > + sscanf(buffer + str_len, "%u", &hugepage_size); > > > + break; > > > + } > > > + } > > > + fclose(file); > > > + if (!hugepage_size) { > > > + prterr("get_hugepage_size: failed to find " > > > + "hugepage size in /proc/meminfo\n"); > > > + return -1; > > > + } > > > + > > > + /* convert from KiB to bytes */ > > > + return hugepage_size << 10; > > > +} > > > + > > > +static void * > > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment, long *buf_size) > > > +{ > > > + void *buf = NULL; > > > +#ifdef MADV_COLLAPSE > > > + int ret; > > > + long size = roundup(len, hugepage_size) + alignment; > > > + > > > + ret = posix_memalign(&buf, hugepage_size, size); > > > + if (ret) { > > > + prterr("posix_memalign for buf"); > > > + return NULL; > > > + } > > > + memset(buf, '\0', size); > > > + ret = madvise(buf, size, MADV_COLLAPSE); > > > + if (ret) { > > > + prterr("madvise collapse for buf"); > > > + free(buf); > > > + return NULL; > > > + } > > > + > > > + *buf_size = size; > > > +#endif > > > + return buf; > > > +} > > > + > > > +static void > > > +init_buffers(void) > > > +{ > > > + int i; > > > + > > > + original_buf = (char *) malloc(maxfilelen); > > > + for (i = 0; i < maxfilelen; i++) > > > + original_buf[i] = random() % 256; > > > + if (hugepages) { > > > + long hugepage_size = get_hugepage_size(); > > > + if (hugepage_size == -1) { > > > + prterr("get_hugepage_size()"); > > > + exit(102); > > > + } > > > + good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy, > > > + &hugepages_info.good_buf_size); > > > + if (!good_buf) { > > > + prterr("init_hugepages_buf failed for good_buf"); > > > + exit(103); > > > + } > > > + hugepages_info.orig_good_buf = good_buf; > > > + > > > + temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy, > > > + &hugepages_info.temp_buf_size); > > > + if (!temp_buf) { > > > + prterr("init_hugepages_buf failed for temp_buf"); > > > + exit(103); > > > + } > > > + hugepages_info.orig_temp_buf = temp_buf; > > > + } else { > > > + unsigned long good_buf_len = maxfilelen + writebdy; > > > + unsigned long temp_buf_len = maxoplen + readbdy; > > > + > > > + good_buf = calloc(1, good_buf_len); > > > + temp_buf = calloc(1, temp_buf_len); > > > + } > > > + good_buf = round_ptr_up(good_buf, writebdy, 0); > > > + temp_buf = round_ptr_up(temp_buf, readbdy, 0); > > > +} > > > + > > > static struct option longopts[] = { > > > {"replay-ops", required_argument, 0, 256}, > > > {"record-ops", optional_argument, 0, 255}, > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv) > > > setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */ > > > > > > while ((ch = getopt_long(argc, argv, > > > - "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > + "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > longopts, NULL)) != EOF) > > > switch (ch) { > > > case 'b': > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv) > > > case 'g': > > > filldata = *optarg; > > > break; > > > + case 'h': > > > +#ifndef MADV_COLLAPSE > > > + fprintf(stderr, "MADV_COLLAPSE not supported. " > > > + "Can't support -h\n"); > > > + exit(86); > > > +#endif > > > + hugepages = 1; > > > + break; > > > > ... > > if we could change this part to: > > > > #ifdef MADV_COLLAPSE > > hugepages = 1; > > #endif > > break; > > > > to avoid the test failures on old systems. > > > > Or any better ideas from you :) > > Hi Zorro, > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What > do you think about skipping generic/758 and generic/759 if the kernel > version is older than 6.1? That to me seems more preferable than the > paste above, as the paste above would execute the test as if it did > test hugepages when in reality it didn't, which would be misleading. Hi Joanne, Sorry I'm still on my holiday, reply a bit late and hastily. At first, your patch has been merged, the merged case numbers are g/759 and g/760, you can rebase to latest for-next branch and write new fix patch :) Then, you're right, above code change is bit rough;) Maybe there's a way to _notrun if MADV_COLLAPSE isn't supported? Thanks, Zorro > > > Thanks, > Joanne > > > > > Thanks, > > Zorro > > > > > case 'i': > > > integrity = 1; > > > logdev = strdup(optarg); > > > @@ -3229,15 +3377,7 @@ main(int argc, char **argv) > > > exit(95); > > > } > > > } > > > - original_buf = (char *) malloc(maxfilelen); > > > - for (i = 0; i < maxfilelen; i++) > > > - original_buf[i] = random() % 256; > > > - good_buf = (char *) malloc(maxfilelen + writebdy); > > > - good_buf = round_ptr_up(good_buf, writebdy, 0); > > > - memset(good_buf, '\0', maxfilelen); > > > - temp_buf = (char *) malloc(maxoplen + readbdy); > > > - temp_buf = round_ptr_up(temp_buf, readbdy, 0); > > > - memset(temp_buf, '\0', maxoplen); > > > + init_buffers(); > > > if (lite) { /* zero entire existing file */ > > > ssize_t written; > > > > > > -- > > > 2.47.1 > > > > > >
On Mon, Feb 3, 2025 at 8:21 PM Darrick J. Wong <djwong@kernel.org> wrote: > > On Mon, Feb 03, 2025 at 01:54:32PM -0800, Darrick J. Wong wrote: > > On Mon, Feb 03, 2025 at 01:40:39PM -0800, Joanne Koong wrote: > > > On Mon, Feb 3, 2025 at 12:01 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > On Mon, Feb 03, 2025 at 11:57:23AM -0800, Joanne Koong wrote: > > > > > On Mon, Feb 3, 2025 at 11:41 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > > > On Mon, Feb 03, 2025 at 11:23:20AM -0800, Joanne Koong wrote: > > > > > > > On Mon, Feb 3, 2025 at 10:59 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > > > > > > > On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote: > > > > > > > > > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote: > > > > > > > > > > > Add support for reads/writes from buffers backed by hugepages. > > > > > > > > > > > This can be enabled through the '-h' flag. This flag should only be used > > > > > > > > > > > on systems where THP capabilities are enabled. > > > > > > > > > > > > > > > > > > > > > > This is motivated by a recent bug that was due to faulty handling of > > > > > > > > > > > userspace buffers backed by hugepages. This patch is a mitigation > > > > > > > > > > > against problems like this in the future. > > > > > > > > > > > > > > > > > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com> > > > > > > > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com> > > > > > > > > > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org> > > > > > > > > > > > --- > > > > > > > > > > > > > > > > > > > > Those two test cases fail on old system which doesn't support > > > > > > > > > > MADV_COLLAPSE: > > > > > > > > > > > > > > > > > > > > fsx -N 10000 -l 500000 -h > > > > > > > > > > -fsx -N 10000 -o 8192 -l 500000 -h > > > > > > > > > > -fsx -N 10000 -o 128000 -l 500000 -h > > > > > > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > > > > > > > > > > > > > and > > > > > > > > > > > > > > > > > > > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > > > +mapped writes DISABLED > > > > > > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > > > > > > > > > > > > > I'm wondering ... > > > > > > > > > > > > > > > > > > > > > ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++----- > > > > > > > > > > > 1 file changed, 153 insertions(+), 13 deletions(-) > > > > > > > > > > > > > > > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > > > > > > > > > > index 41933354..3be383c6 100644 > > > > > > > > > > > --- a/ltp/fsx.c > > > > > > > > > > > +++ b/ltp/fsx.c > > > > > > > > > > > static struct option longopts[] = { > > > > > > > > > > > {"replay-ops", required_argument, 0, 256}, > > > > > > > > > > > {"record-ops", optional_argument, 0, 255}, > > > > > > > > > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv) > > > > > > > > > > > setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */ > > > > > > > > > > > > > > > > > > > > > > while ((ch = getopt_long(argc, argv, > > > > > > > > > > > - "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > > > > > > > > + "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > > > > > > > > longopts, NULL)) != EOF) > > > > > > > > > > > switch (ch) { > > > > > > > > > > > case 'b': > > > > > > > > > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv) > > > > > > > > > > > case 'g': > > > > > > > > > > > filldata = *optarg; > > > > > > > > > > > break; > > > > > > > > > > > + case 'h': > > > > > > > > > > > +#ifndef MADV_COLLAPSE > > > > > > > > > > > + fprintf(stderr, "MADV_COLLAPSE not supported. " > > > > > > > > > > > + "Can't support -h\n"); > > > > > > > > > > > + exit(86); > > > > > > > > > > > +#endif > > > > > > > > > > > + hugepages = 1; > > > > > > > > > > > + break; > > > > > > > > > > > > > > > > > > > > ... > > > > > > > > > > if we could change this part to: > > > > > > > > > > > > > > > > > > > > #ifdef MADV_COLLAPSE > > > > > > > > > > hugepages = 1; > > > > > > > > > > #endif > > > > > > > > > > break; > > > > > > > > > > > > > > > > > > > > to avoid the test failures on old systems. > > > > > > > > > > > > > > > > > > > > Or any better ideas from you :) > > > > > > > > > > > > > > > > > > Hi Zorro, > > > > > > > > > > > > > > > > > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What > > > > > > > > > do you think about skipping generic/758 and generic/759 if the kernel > > > > > > > > > version is older than 6.1? That to me seems more preferable than the > > > > > > > > > paste above, as the paste above would execute the test as if it did > > > > > > > > > test hugepages when in reality it didn't, which would be misleading. > > > > > > > > > > > > > > > > Now that I've gotten to try this out -- > > > > > > > > > > > > > > > > There's a couple of things going on here. The first is that generic/759 > > > > > > > > and 760 need to check if invoking fsx -h causes it to spit out the > > > > > > > > "MADV_COLLAPSE not supported" error and _notrun the test. > > > > > > > > > > > > > > > > The second thing is that userspace programs can ensure the existence of > > > > > > > > MADV_COLLAPSE in multiple ways. The first way is through sys/mman.h, > > > > > > > > which requires that the underlying C library headers are new enough to > > > > > > > > include a definition. glibc 2.37 is new enough, but even things like > > > > > > > > Debian 12 and RHEL 9 aren't new enough to have that. Other C libraries > > > > > > > > might not follow glibc's practice of wrapping and/or redefining symbols > > > > > > > > in a way that you hope is the same as... > > > > > > > > > > > > > > > > The second way is through linux/mman.h, which comes from the kernel > > > > > > > > headers package; and the third way is for the program to define it > > > > > > > > itself if nobody else does. > > > > > > > > > > > > > > > > So I think the easiest way to fix the fsx.c build is to include > > > > > > > > linux/mman.h in addition to sys/mman.h. Sorry I didn't notice these > > > > > > > > > > > > > > Thanks for your input. Do we still need sys/mman.h if linux/mman.h is added? > > > > > > > > > > > > Yes, because glibc provides the mmap() function that wraps > > > > > > syscall(__NR_mmap, ...); > > > > > > > > > > > > > For generic/758 and 759, does it suffice to gate this on whether the > > > > > > > kernel version if 6.1+ and _notrun if not? My understanding is that > > > > > > > any kernel version 6.1 or newer will have MADV_COLLAPSE in its kernel > > > > > > > headers package and support the feature. > > > > > > > > > > > > No, because some (most?) vendors backport new features into existing > > > > > > kernels without revving the version number of that kernel. > > > > > > > > > > Oh okay, I see. That makes sense, thanks for the explanation. > > > > > > > > > > > > > > > > > Maybe the following fixes things? > > > > > > > > > > > > --D > > > > > > > > > > > > generic/759,760: fix MADV_COLLAPSE detection and inclusion > > > > > > > > > > > > On systems with "old" C libraries such as glibc 2.36 in Debian 12, the > > > > > > MADV_COLLAPSE flag might not be defined in any of the header files > > > > > > pulled in by sys/mman.h, which means that the fsx binary might not get > > > > > > built with any of the MADV_COLLAPSE code. If the kernel supports THP, > > > > > > the test will fail with: > > > > > > > > > > > > > QA output created by 760 > > > > > > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > +mapped writes DISABLED > > > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > > > > > Fix both tests to detect fsx binaries that don't support MADV_COLLAPSE, > > > > > > then fix fsx.c to include the mman.h from the kernel headers (aka > > > > > > linux/mman.h) so that we can actually test IOs to and from THPs if the > > > > > > kernel is newer than the rest of userspace. > > > > > > > > > > > > Cc: <fstests@vger.kernel.org> # v2025.02.02 > > > > > > Fixes: 627289232371e3 ("generic: add tests for read/writes from hugepages-backed buffers") > > > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > > > > > --- > > > > > > ltp/fsx.c | 1 + > > > > > > tests/generic/759 | 3 +++ > > > > > > tests/generic/760 | 3 +++ > > > > > > 3 files changed, 7 insertions(+) > > > > > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > > > > > index 634c496ffe9317..cf9502a74c17a7 100644 > > > > > > --- a/ltp/fsx.c > > > > > > +++ b/ltp/fsx.c > > > > > > @@ -20,6 +20,7 @@ > > > > > > #include <strings.h> > > > > > > #include <sys/file.h> > > > > > > #include <sys/mman.h> > > > > > > +#include <linux/mman.h> > > > > > > #include <sys/uio.h> > > > > > > #include <stdbool.h> > > > > > > #ifdef HAVE_ERR_H > > > > > > diff --git a/tests/generic/759 b/tests/generic/759 > > > > > > index 6c74478aa8a0e0..3549c5ed6a9299 100755 > > > > > > --- a/tests/generic/759 > > > > > > +++ b/tests/generic/759 > > > > > > @@ -14,6 +14,9 @@ _begin_fstest rw auto quick > > > > > > _require_test > > > > > > _require_thp > > > > > > > > > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \ > > > > > > + _notrun "fsx binary does not support MADV_COLLAPSE" > > > > > > + > > > > > > run_fsx -N 10000 -l 500000 -h > > > > > > run_fsx -N 10000 -o 8192 -l 500000 -h > > > > > > run_fsx -N 10000 -o 128000 -l 500000 -h > > > > > > diff --git a/tests/generic/760 b/tests/generic/760 > > > > > > index c71a196222ad3b..2fbd28502ae678 100755 > > > > > > --- a/tests/generic/760 > > > > > > +++ b/tests/generic/760 > > > > > > @@ -15,6 +15,9 @@ _require_test > > > > > > _require_odirect > > > > > > _require_thp > > > > > > > > > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \ > > > > > > + _notrun "fsx binary does not support MADV_COLLAPSE" > > > > > > + > > > > > > > > > > I tried this out locally and it works for me: > > > > > > > > > > generic/759 8s ... [not run] fsx binary does not support MADV_COLLAPSE > > > > > Ran: generic/759 > > > > > Not run: generic/759 > > > > > Passed all 1 tests > > > > > > > > > > SECTION -- fuse > > > > > ========================= > > > > > Ran: generic/759 > > > > > Not run: generic/759 > > > > > Passed all 1 tests > > > > > > > > Does the test actually run if you /do/ have kernel/libc headers that > > > > define MADV_COLLAPSE? And if so, does that count as a Tested-by? > > > > > > > > > > I'm not sure if I fully understand the subtext of your question but > > > yes, the test runs on my system when MADV_COLLAPSE is defined in the > > > kernel/libc headers. > > > > Yep, you understood me correctly. :) > > > > > For sanity checking the inverse case, (eg MADV_COLLAPSE not defined), > > > I tried this out by modifying the ifdef/ifndef checks in fsx to > > > MADV_COLLAPSE_ to see if the '$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | > > > grep -q 'MADV_COLLAPSE not supported'' line catches that. > > > > <nod> Sounds good then; I'll add this to my test clod and run it > > overnight. > > The arm64 vms with 64k base pages spat out this: > > --- /run/fstests/bin/tests/generic/759.out 2025-02-02 08:36:28.007248055 -0800 > +++ /var/tmp/fstests/generic/759.out.bad 2025-02-03 16:51:34.862964640 -0800 > @@ -1,4 +1,5 @@ > QA output created by 759 > fsx -N 10000 -l 500000 -h > -fsx -N 10000 -o 8192 -l 500000 -h > -fsx -N 10000 -o 128000 -l 500000 -h > +Seed set to 1 > +madvise collapse for buf: Cannot allocate memory > +init_hugepages_buf failed for good_buf: Cannot allocate memory > > Note that it was trying to create a 512M page(!) on a VM with 8G of > memory. Normally one doesn't use large base page size in low memory > environments, but this /is/ fstestsland. 8-) > > From commit 7d8faaf155454f ("mm/madvise: introduce MADV_COLLAPSE sync > hugepage collapse") : > > ENOMEM Memory allocation failed or VMA not found > EBUSY Memcg charging failed > EAGAIN Required resource temporarily unavailable. Try again > might succeed. > EINVAL Other error: No PMD found, subpage doesn't have Present > bit set, "Special" page no backed by struct page, VMA > incorrectly sized, address not page-aligned, ... > > It sounds like ENOMEM/EBUSY/EINVAL should result in > _notrun "Could not generate hugepage" ? What are your thoughts? > Thanks for running it overnight. This sounds good to me, but will this be robust against false negatives? For example, if it succeeds when we're doing the $here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && _notrun "fsx binary does not support MADV_COLLAPSE" check, does that guarantee that the actual run won't error out with ENOMEM/EBUSY/EINVAL? It seems like there could be the case where it passes the check but then when it actually runs, the system state memory state may have changed and now memcg charging or the memory allocation fails? EAGAIN seems a bit iffy to me - hopefully this doesn't happen often but if it does, it would likely be a false negative fail for the generic test? Maybe instead of "$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && _notrun "fsx binary does not support MADV_COLLAPSE"", we should run the test and then if the output to the .out file is "MADV_COLLAPSE not supported", then we deem that a pass/success? Not sure if this is possible in the fstest infrastructure though for checking against two possible outputs in the .out file. What are your thoughts? Thanks, Joanne > --D > > > --D > > > > > > > > > --D > > > > > > > > > > > > > > Thanks, > > > > > Joanne > > > > > > > > > > > psize=`$here/src/feature -s` > > > > > > bsize=`$here/src/min_dio_alignment $TEST_DIR $TEST_DEV` > > > > > > > > > > > > >
On Tue, Feb 4, 2025 at 9:05 AM Zorro Lang <zlang@redhat.com> wrote: > > On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote: > > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote: > > > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote: > > > > Add support for reads/writes from buffers backed by hugepages. > > > > This can be enabled through the '-h' flag. This flag should only be used > > > > on systems where THP capabilities are enabled. > > > > > > > > This is motivated by a recent bug that was due to faulty handling of > > > > userspace buffers backed by hugepages. This patch is a mitigation > > > > against problems like this in the future. > > > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com> > > > > Reviewed-by: Brian Foster <bfoster@redhat.com> > > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org> > > > > --- > > > > > > Those two test cases fail on old system which doesn't support > > > MADV_COLLAPSE: > > > > > > fsx -N 10000 -l 500000 -h > > > -fsx -N 10000 -o 8192 -l 500000 -h > > > -fsx -N 10000 -o 128000 -l 500000 -h > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > and > > > > > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > +mapped writes DISABLED > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > I'm wondering ... > > > > > > > ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++----- > > > > 1 file changed, 153 insertions(+), 13 deletions(-) > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > > > index 41933354..3be383c6 100644 > > > > --- a/ltp/fsx.c > > > > +++ b/ltp/fsx.c > > > > @@ -190,6 +190,16 @@ int o_direct; /* -Z */ > > > > + > > > > static struct option longopts[] = { > > > > {"replay-ops", required_argument, 0, 256}, > > > > {"record-ops", optional_argument, 0, 255}, > > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv) > > > > setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */ > > > > > > > > while ((ch = getopt_long(argc, argv, > > > > - "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > + "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > longopts, NULL)) != EOF) > > > > switch (ch) { > > > > case 'b': > > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv) > > > > case 'g': > > > > filldata = *optarg; > > > > break; > > > > + case 'h': > > > > +#ifndef MADV_COLLAPSE > > > > + fprintf(stderr, "MADV_COLLAPSE not supported. " > > > > + "Can't support -h\n"); > > > > + exit(86); > > > > +#endif > > > > + hugepages = 1; > > > > + break; > > > > > > ... > > > if we could change this part to: > > > > > > #ifdef MADV_COLLAPSE > > > hugepages = 1; > > > #endif > > > break; > > > > > > to avoid the test failures on old systems. > > > > > > Or any better ideas from you :) > > > > Hi Zorro, > > > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What > > do you think about skipping generic/758 and generic/759 if the kernel > > version is older than 6.1? That to me seems more preferable than the > > paste above, as the paste above would execute the test as if it did > > test hugepages when in reality it didn't, which would be misleading. > > Hi Joanne, > > Sorry I'm still on my holiday, reply a bit late and hastily. At first, > your patch has been merged, the merged case numbers are g/759 and g/760, > you can rebase to latest for-next branch and write new fix patch :) Hi Zorro, No worries at all. Thanks for your work on this. > > Then, you're right, above code change is bit rough;) Maybe there's a way > to _notrun if MADV_COLLAPSE isn't supported? I'll (or Darrick if we go with his solution) make sure to submit a fix for this so that it doesn't break the older systems. Thanks, Joanne > > Thanks, > Zorro > > > > > > > Thanks, > > Joanne > > > > > > > > Thanks, > > > Zorro > > > > > > > case 'i': > > > > integrity = 1; > > > > logdev = strdup(optarg); > > > > @@ -3229,15 +3377,7 @@ main(int argc, char **argv) > > > > exit(95); > > > > } > > > > } > > > > - original_buf = (char *) malloc(maxfilelen); > > > > - for (i = 0; i < maxfilelen; i++) > > > > - original_buf[i] = random() % 256; > > > > - good_buf = (char *) malloc(maxfilelen + writebdy); > > > > - good_buf = round_ptr_up(good_buf, writebdy, 0); > > > > - memset(good_buf, '\0', maxfilelen); > > > > - temp_buf = (char *) malloc(maxoplen + readbdy); > > > > - temp_buf = round_ptr_up(temp_buf, readbdy, 0); > > > > - memset(temp_buf, '\0', maxoplen); > > > > + init_buffers(); > > > > if (lite) { /* zero entire existing file */ > > > > ssize_t written; > > > > > > > > -- > > > > 2.47.1 > > > > > > > > > >
On Tue, Feb 04, 2025 at 12:50:04PM -0800, Joanne Koong wrote: > On Mon, Feb 3, 2025 at 8:21 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > On Mon, Feb 03, 2025 at 01:54:32PM -0800, Darrick J. Wong wrote: > > > On Mon, Feb 03, 2025 at 01:40:39PM -0800, Joanne Koong wrote: > > > > On Mon, Feb 3, 2025 at 12:01 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > On Mon, Feb 03, 2025 at 11:57:23AM -0800, Joanne Koong wrote: > > > > > > On Mon, Feb 3, 2025 at 11:41 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > > > > > On Mon, Feb 03, 2025 at 11:23:20AM -0800, Joanne Koong wrote: > > > > > > > > On Mon, Feb 3, 2025 at 10:59 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > > > > > > > > > On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote: > > > > > > > > > > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote: > > > > > > > > > > > > Add support for reads/writes from buffers backed by hugepages. > > > > > > > > > > > > This can be enabled through the '-h' flag. This flag should only be used > > > > > > > > > > > > on systems where THP capabilities are enabled. > > > > > > > > > > > > > > > > > > > > > > > > This is motivated by a recent bug that was due to faulty handling of > > > > > > > > > > > > userspace buffers backed by hugepages. This patch is a mitigation > > > > > > > > > > > > against problems like this in the future. > > > > > > > > > > > > > > > > > > > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com> > > > > > > > > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com> > > > > > > > > > > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org> > > > > > > > > > > > > --- > > > > > > > > > > > > > > > > > > > > > > Those two test cases fail on old system which doesn't support > > > > > > > > > > > MADV_COLLAPSE: > > > > > > > > > > > > > > > > > > > > > > fsx -N 10000 -l 500000 -h > > > > > > > > > > > -fsx -N 10000 -o 8192 -l 500000 -h > > > > > > > > > > > -fsx -N 10000 -o 128000 -l 500000 -h > > > > > > > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > > > > > > > > > > > > > > > and > > > > > > > > > > > > > > > > > > > > > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > > > > +mapped writes DISABLED > > > > > > > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > > > > > > > > > > > > > > > I'm wondering ... > > > > > > > > > > > > > > > > > > > > > > > ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++----- > > > > > > > > > > > > 1 file changed, 153 insertions(+), 13 deletions(-) > > > > > > > > > > > > > > > > > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > > > > > > > > > > > index 41933354..3be383c6 100644 > > > > > > > > > > > > --- a/ltp/fsx.c > > > > > > > > > > > > +++ b/ltp/fsx.c > > > > > > > > > > > > static struct option longopts[] = { > > > > > > > > > > > > {"replay-ops", required_argument, 0, 256}, > > > > > > > > > > > > {"record-ops", optional_argument, 0, 255}, > > > > > > > > > > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv) > > > > > > > > > > > > setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */ > > > > > > > > > > > > > > > > > > > > > > > > while ((ch = getopt_long(argc, argv, > > > > > > > > > > > > - "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > > > > > > > > > + "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > > > > > > > > > longopts, NULL)) != EOF) > > > > > > > > > > > > switch (ch) { > > > > > > > > > > > > case 'b': > > > > > > > > > > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv) > > > > > > > > > > > > case 'g': > > > > > > > > > > > > filldata = *optarg; > > > > > > > > > > > > break; > > > > > > > > > > > > + case 'h': > > > > > > > > > > > > +#ifndef MADV_COLLAPSE > > > > > > > > > > > > + fprintf(stderr, "MADV_COLLAPSE not supported. " > > > > > > > > > > > > + "Can't support -h\n"); > > > > > > > > > > > > + exit(86); > > > > > > > > > > > > +#endif > > > > > > > > > > > > + hugepages = 1; > > > > > > > > > > > > + break; > > > > > > > > > > > > > > > > > > > > > > ... > > > > > > > > > > > if we could change this part to: > > > > > > > > > > > > > > > > > > > > > > #ifdef MADV_COLLAPSE > > > > > > > > > > > hugepages = 1; > > > > > > > > > > > #endif > > > > > > > > > > > break; > > > > > > > > > > > > > > > > > > > > > > to avoid the test failures on old systems. > > > > > > > > > > > > > > > > > > > > > > Or any better ideas from you :) > > > > > > > > > > > > > > > > > > > > Hi Zorro, > > > > > > > > > > > > > > > > > > > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What > > > > > > > > > > do you think about skipping generic/758 and generic/759 if the kernel > > > > > > > > > > version is older than 6.1? That to me seems more preferable than the > > > > > > > > > > paste above, as the paste above would execute the test as if it did > > > > > > > > > > test hugepages when in reality it didn't, which would be misleading. > > > > > > > > > > > > > > > > > > Now that I've gotten to try this out -- > > > > > > > > > > > > > > > > > > There's a couple of things going on here. The first is that generic/759 > > > > > > > > > and 760 need to check if invoking fsx -h causes it to spit out the > > > > > > > > > "MADV_COLLAPSE not supported" error and _notrun the test. > > > > > > > > > > > > > > > > > > The second thing is that userspace programs can ensure the existence of > > > > > > > > > MADV_COLLAPSE in multiple ways. The first way is through sys/mman.h, > > > > > > > > > which requires that the underlying C library headers are new enough to > > > > > > > > > include a definition. glibc 2.37 is new enough, but even things like > > > > > > > > > Debian 12 and RHEL 9 aren't new enough to have that. Other C libraries > > > > > > > > > might not follow glibc's practice of wrapping and/or redefining symbols > > > > > > > > > in a way that you hope is the same as... > > > > > > > > > > > > > > > > > > The second way is through linux/mman.h, which comes from the kernel > > > > > > > > > headers package; and the third way is for the program to define it > > > > > > > > > itself if nobody else does. > > > > > > > > > > > > > > > > > > So I think the easiest way to fix the fsx.c build is to include > > > > > > > > > linux/mman.h in addition to sys/mman.h. Sorry I didn't notice these > > > > > > > > > > > > > > > > Thanks for your input. Do we still need sys/mman.h if linux/mman.h is added? > > > > > > > > > > > > > > Yes, because glibc provides the mmap() function that wraps > > > > > > > syscall(__NR_mmap, ...); > > > > > > > > > > > > > > > For generic/758 and 759, does it suffice to gate this on whether the > > > > > > > > kernel version if 6.1+ and _notrun if not? My understanding is that > > > > > > > > any kernel version 6.1 or newer will have MADV_COLLAPSE in its kernel > > > > > > > > headers package and support the feature. > > > > > > > > > > > > > > No, because some (most?) vendors backport new features into existing > > > > > > > kernels without revving the version number of that kernel. > > > > > > > > > > > > Oh okay, I see. That makes sense, thanks for the explanation. > > > > > > > > > > > > > > > > > > > > Maybe the following fixes things? > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > generic/759,760: fix MADV_COLLAPSE detection and inclusion > > > > > > > > > > > > > > On systems with "old" C libraries such as glibc 2.36 in Debian 12, the > > > > > > > MADV_COLLAPSE flag might not be defined in any of the header files > > > > > > > pulled in by sys/mman.h, which means that the fsx binary might not get > > > > > > > built with any of the MADV_COLLAPSE code. If the kernel supports THP, > > > > > > > the test will fail with: > > > > > > > > > > > > > > > QA output created by 760 > > > > > > > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > +mapped writes DISABLED > > > > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > > > > > > > Fix both tests to detect fsx binaries that don't support MADV_COLLAPSE, > > > > > > > then fix fsx.c to include the mman.h from the kernel headers (aka > > > > > > > linux/mman.h) so that we can actually test IOs to and from THPs if the > > > > > > > kernel is newer than the rest of userspace. > > > > > > > > > > > > > > Cc: <fstests@vger.kernel.org> # v2025.02.02 > > > > > > > Fixes: 627289232371e3 ("generic: add tests for read/writes from hugepages-backed buffers") > > > > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > > > > > > --- > > > > > > > ltp/fsx.c | 1 + > > > > > > > tests/generic/759 | 3 +++ > > > > > > > tests/generic/760 | 3 +++ > > > > > > > 3 files changed, 7 insertions(+) > > > > > > > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > > > > > > index 634c496ffe9317..cf9502a74c17a7 100644 > > > > > > > --- a/ltp/fsx.c > > > > > > > +++ b/ltp/fsx.c > > > > > > > @@ -20,6 +20,7 @@ > > > > > > > #include <strings.h> > > > > > > > #include <sys/file.h> > > > > > > > #include <sys/mman.h> > > > > > > > +#include <linux/mman.h> > > > > > > > #include <sys/uio.h> > > > > > > > #include <stdbool.h> > > > > > > > #ifdef HAVE_ERR_H > > > > > > > diff --git a/tests/generic/759 b/tests/generic/759 > > > > > > > index 6c74478aa8a0e0..3549c5ed6a9299 100755 > > > > > > > --- a/tests/generic/759 > > > > > > > +++ b/tests/generic/759 > > > > > > > @@ -14,6 +14,9 @@ _begin_fstest rw auto quick > > > > > > > _require_test > > > > > > > _require_thp > > > > > > > > > > > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \ > > > > > > > + _notrun "fsx binary does not support MADV_COLLAPSE" > > > > > > > + > > > > > > > run_fsx -N 10000 -l 500000 -h > > > > > > > run_fsx -N 10000 -o 8192 -l 500000 -h > > > > > > > run_fsx -N 10000 -o 128000 -l 500000 -h > > > > > > > diff --git a/tests/generic/760 b/tests/generic/760 > > > > > > > index c71a196222ad3b..2fbd28502ae678 100755 > > > > > > > --- a/tests/generic/760 > > > > > > > +++ b/tests/generic/760 > > > > > > > @@ -15,6 +15,9 @@ _require_test > > > > > > > _require_odirect > > > > > > > _require_thp > > > > > > > > > > > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \ > > > > > > > + _notrun "fsx binary does not support MADV_COLLAPSE" > > > > > > > + > > > > > > > > > > > > I tried this out locally and it works for me: > > > > > > > > > > > > generic/759 8s ... [not run] fsx binary does not support MADV_COLLAPSE > > > > > > Ran: generic/759 > > > > > > Not run: generic/759 > > > > > > Passed all 1 tests > > > > > > > > > > > > SECTION -- fuse > > > > > > ========================= > > > > > > Ran: generic/759 > > > > > > Not run: generic/759 > > > > > > Passed all 1 tests > > > > > > > > > > Does the test actually run if you /do/ have kernel/libc headers that > > > > > define MADV_COLLAPSE? And if so, does that count as a Tested-by? > > > > > > > > > > > > > I'm not sure if I fully understand the subtext of your question but > > > > yes, the test runs on my system when MADV_COLLAPSE is defined in the > > > > kernel/libc headers. > > > > > > Yep, you understood me correctly. :) > > > > > > > For sanity checking the inverse case, (eg MADV_COLLAPSE not defined), > > > > I tried this out by modifying the ifdef/ifndef checks in fsx to > > > > MADV_COLLAPSE_ to see if the '$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | > > > > grep -q 'MADV_COLLAPSE not supported'' line catches that. > > > > > > <nod> Sounds good then; I'll add this to my test clod and run it > > > overnight. > > > > The arm64 vms with 64k base pages spat out this: > > > > --- /run/fstests/bin/tests/generic/759.out 2025-02-02 08:36:28.007248055 -0800 > > +++ /var/tmp/fstests/generic/759.out.bad 2025-02-03 16:51:34.862964640 -0800 > > @@ -1,4 +1,5 @@ > > QA output created by 759 > > fsx -N 10000 -l 500000 -h > > -fsx -N 10000 -o 8192 -l 500000 -h > > -fsx -N 10000 -o 128000 -l 500000 -h > > +Seed set to 1 > > +madvise collapse for buf: Cannot allocate memory > > +init_hugepages_buf failed for good_buf: Cannot allocate memory > > > > Note that it was trying to create a 512M page(!) on a VM with 8G of > > memory. Normally one doesn't use large base page size in low memory > > environments, but this /is/ fstestsland. 8-) > > > > From commit 7d8faaf155454f ("mm/madvise: introduce MADV_COLLAPSE sync > > hugepage collapse") : > > > > ENOMEM Memory allocation failed or VMA not found > > EBUSY Memcg charging failed > > EAGAIN Required resource temporarily unavailable. Try again > > might succeed. > > EINVAL Other error: No PMD found, subpage doesn't have Present > > bit set, "Special" page no backed by struct page, VMA > > incorrectly sized, address not page-aligned, ... > > > > It sounds like ENOMEM/EBUSY/EINVAL should result in > > _notrun "Could not generate hugepage" ? What are your thoughts? > > > > Thanks for running it overnight. This sounds good to me, but will this > be robust against false negatives? For example, if it succeeds when > we're doing the > > $here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not > supported' && _notrun "fsx binary does not support MADV_COLLAPSE" > > check, does that guarantee that the actual run won't error out with > ENOMEM/EBUSY/EINVAL? Nope. That code only checks that the fsx binary was built with MADV_COLLAPSE support, not that any of it actually works. > It seems like there could be the case where it > passes the check but then when it actually runs, the system state > memory state may have changed and now memcg charging or the memory > allocation fails? Indeed, I think the error I saw is exactly this happening. In the end I turned the fsx -h invocation into a helper that checks for any of those three errors (ENOMEM/BUSY/INVAL) and _notruns the test if we can't even get a hugepage at the start. # Run fsx with -h(ugepage buffers). If we can't set up a hugepage then skip # the test, but if any other error occurs then exit the test. _run_hugepage_fsx() { _run_fsx "$@" -h &> $tmp.hugepage_fsx local res=$? if [ $res -eq 103 ]; then # According to the MADV_COLLAPSE manpage, these three errors # can happen if the kernel could not collapse a collection of # pages into a single huge page. grep -q -E ' for hugebuf: (Cannot allocate memory|Device or resource busy|Resource temporarily unavailable)' $tmp.hugepage_fsx && \ _notrun "Could not set up huge page for test" fi cat $tmp.hugepage_fsx rm -f $tmp.hugepage_fsx test $res -ne 0 && exit 1 return 0 } Note that it won't prevent or paper over subsequent MADV_COLLAPSE failures once the process is up and running, though as long as nothing else in fsx fails or corrupts the file, the collapse failures won't be reported. (As is the case now). Anyway I'll send patches for both issues. > allocation fails? EAGAIN seems a bit iffy to me - hopefully this > doesn't happen often but if it does, it would likely be a false > negative fail for the generic test? > > Maybe instead of "$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q > 'MADV_COLLAPSE not supported' && _notrun "fsx binary does not support > MADV_COLLAPSE"", we should run the test and then if the output to the > .out file is "MADV_COLLAPSE not supported", then we deem that a > pass/success? Not sure if this is possible in the fstest > infrastructure though for checking against two possible outputs in the > .out file. What are your thoughts? You ... can switch the .out(put) dynamically, but the mechanism to do it is underdocumented and quirky; and it makes understanding what the test does rather difficult. See _link_out_file in xfs/614 if you want to open that Pandora's box. --D > > > Thanks, > Joanne > > > --D > > > > > --D > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > Thanks, > > > > > > Joanne > > > > > > > > > > > > > psize=`$here/src/feature -s` > > > > > > > bsize=`$here/src/min_dio_alignment $TEST_DIR $TEST_DEV` > > > > > > > > > > > > > > > > >
On Tue, Feb 4, 2025 at 1:31 PM Darrick J. Wong <djwong@kernel.org> wrote: > > On Tue, Feb 04, 2025 at 12:50:04PM -0800, Joanne Koong wrote: > > On Mon, Feb 3, 2025 at 8:21 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > On Mon, Feb 03, 2025 at 01:54:32PM -0800, Darrick J. Wong wrote: > > > > On Mon, Feb 03, 2025 at 01:40:39PM -0800, Joanne Koong wrote: > > > > > On Mon, Feb 3, 2025 at 12:01 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > > > On Mon, Feb 03, 2025 at 11:57:23AM -0800, Joanne Koong wrote: > > > > > > > On Mon, Feb 3, 2025 at 11:41 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > > > > > > > On Mon, Feb 03, 2025 at 11:23:20AM -0800, Joanne Koong wrote: > > > > > > > > > On Mon, Feb 3, 2025 at 10:59 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > > > > > > > > > > > On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote: > > > > > > > > > > > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote: > > > > > > > > > > > > > Add support for reads/writes from buffers backed by hugepages. > > > > > > > > > > > > > This can be enabled through the '-h' flag. This flag should only be used > > > > > > > > > > > > > on systems where THP capabilities are enabled. > > > > > > > > > > > > > > > > > > > > > > > > > > This is motivated by a recent bug that was due to faulty handling of > > > > > > > > > > > > > userspace buffers backed by hugepages. This patch is a mitigation > > > > > > > > > > > > > against problems like this in the future. > > > > > > > > > > > > > > > > > > > > > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com> > > > > > > > > > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com> > > > > > > > > > > > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org> > > > > > > > > > > > > > --- > > > > > > > > > > > > > > > > > > > > > > > > Those two test cases fail on old system which doesn't support > > > > > > > > > > > > MADV_COLLAPSE: > > > > > > > > > > > > > > > > > > > > > > > > fsx -N 10000 -l 500000 -h > > > > > > > > > > > > -fsx -N 10000 -o 8192 -l 500000 -h > > > > > > > > > > > > -fsx -N 10000 -o 128000 -l 500000 -h > > > > > > > > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > > > > > > > > > > > > > > > > > and > > > > > > > > > > > > > > > > > > > > > > > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > > > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > > > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > > > > > +mapped writes DISABLED > > > > > > > > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > > > > > > > > > > > > > > > > > I'm wondering ... > > > > > > > > > > > > > > > > > > > > > > > > > ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++----- > > > > > > > > > > > > > 1 file changed, 153 insertions(+), 13 deletions(-) > > > > > > > > > > > > > > > > > > > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > > > > > > > > > > > > index 41933354..3be383c6 100644 > > > > > > > > > > > > > --- a/ltp/fsx.c > > > > > > > > > > > > > +++ b/ltp/fsx.c > > > > > > > > > > > > > static struct option longopts[] = { > > > > > > > > > > > > > {"replay-ops", required_argument, 0, 256}, > > > > > > > > > > > > > {"record-ops", optional_argument, 0, 255}, > > > > > > > > > > > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv) > > > > > > > > > > > > > setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */ > > > > > > > > > > > > > > > > > > > > > > > > > > while ((ch = getopt_long(argc, argv, > > > > > > > > > > > > > - "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > > > > > > > > > > + "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", > > > > > > > > > > > > > longopts, NULL)) != EOF) > > > > > > > > > > > > > switch (ch) { > > > > > > > > > > > > > case 'b': > > > > > > > > > > > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv) > > > > > > > > > > > > > case 'g': > > > > > > > > > > > > > filldata = *optarg; > > > > > > > > > > > > > break; > > > > > > > > > > > > > + case 'h': > > > > > > > > > > > > > +#ifndef MADV_COLLAPSE > > > > > > > > > > > > > + fprintf(stderr, "MADV_COLLAPSE not supported. " > > > > > > > > > > > > > + "Can't support -h\n"); > > > > > > > > > > > > > + exit(86); > > > > > > > > > > > > > +#endif > > > > > > > > > > > > > + hugepages = 1; > > > > > > > > > > > > > + break; > > > > > > > > > > > > > > > > > > > > > > > > ... > > > > > > > > > > > > if we could change this part to: > > > > > > > > > > > > > > > > > > > > > > > > #ifdef MADV_COLLAPSE > > > > > > > > > > > > hugepages = 1; > > > > > > > > > > > > #endif > > > > > > > > > > > > break; > > > > > > > > > > > > > > > > > > > > > > > > to avoid the test failures on old systems. > > > > > > > > > > > > > > > > > > > > > > > > Or any better ideas from you :) > > > > > > > > > > > > > > > > > > > > > > Hi Zorro, > > > > > > > > > > > > > > > > > > > > > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What > > > > > > > > > > > do you think about skipping generic/758 and generic/759 if the kernel > > > > > > > > > > > version is older than 6.1? That to me seems more preferable than the > > > > > > > > > > > paste above, as the paste above would execute the test as if it did > > > > > > > > > > > test hugepages when in reality it didn't, which would be misleading. > > > > > > > > > > > > > > > > > > > > Now that I've gotten to try this out -- > > > > > > > > > > > > > > > > > > > > There's a couple of things going on here. The first is that generic/759 > > > > > > > > > > and 760 need to check if invoking fsx -h causes it to spit out the > > > > > > > > > > "MADV_COLLAPSE not supported" error and _notrun the test. > > > > > > > > > > > > > > > > > > > > The second thing is that userspace programs can ensure the existence of > > > > > > > > > > MADV_COLLAPSE in multiple ways. The first way is through sys/mman.h, > > > > > > > > > > which requires that the underlying C library headers are new enough to > > > > > > > > > > include a definition. glibc 2.37 is new enough, but even things like > > > > > > > > > > Debian 12 and RHEL 9 aren't new enough to have that. Other C libraries > > > > > > > > > > might not follow glibc's practice of wrapping and/or redefining symbols > > > > > > > > > > in a way that you hope is the same as... > > > > > > > > > > > > > > > > > > > > The second way is through linux/mman.h, which comes from the kernel > > > > > > > > > > headers package; and the third way is for the program to define it > > > > > > > > > > itself if nobody else does. > > > > > > > > > > > > > > > > > > > > So I think the easiest way to fix the fsx.c build is to include > > > > > > > > > > linux/mman.h in addition to sys/mman.h. Sorry I didn't notice these > > > > > > > > > > > > > > > > > > Thanks for your input. Do we still need sys/mman.h if linux/mman.h is added? > > > > > > > > > > > > > > > > Yes, because glibc provides the mmap() function that wraps > > > > > > > > syscall(__NR_mmap, ...); > > > > > > > > > > > > > > > > > For generic/758 and 759, does it suffice to gate this on whether the > > > > > > > > > kernel version if 6.1+ and _notrun if not? My understanding is that > > > > > > > > > any kernel version 6.1 or newer will have MADV_COLLAPSE in its kernel > > > > > > > > > headers package and support the feature. > > > > > > > > > > > > > > > > No, because some (most?) vendors backport new features into existing > > > > > > > > kernels without revving the version number of that kernel. > > > > > > > > > > > > > > Oh okay, I see. That makes sense, thanks for the explanation. > > > > > > > > > > > > > > > > > > > > > > > Maybe the following fixes things? > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > generic/759,760: fix MADV_COLLAPSE detection and inclusion > > > > > > > > > > > > > > > > On systems with "old" C libraries such as glibc 2.36 in Debian 12, the > > > > > > > > MADV_COLLAPSE flag might not be defined in any of the header files > > > > > > > > pulled in by sys/mman.h, which means that the fsx binary might not get > > > > > > > > built with any of the MADV_COLLAPSE code. If the kernel supports THP, > > > > > > > > the test will fail with: > > > > > > > > > > > > > > > > > QA output created by 760 > > > > > > > > > fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h > > > > > > > > > +mapped writes DISABLED > > > > > > > > > +MADV_COLLAPSE not supported. Can't support -h > > > > > > > > > > > > > > > > Fix both tests to detect fsx binaries that don't support MADV_COLLAPSE, > > > > > > > > then fix fsx.c to include the mman.h from the kernel headers (aka > > > > > > > > linux/mman.h) so that we can actually test IOs to and from THPs if the > > > > > > > > kernel is newer than the rest of userspace. > > > > > > > > > > > > > > > > Cc: <fstests@vger.kernel.org> # v2025.02.02 > > > > > > > > Fixes: 627289232371e3 ("generic: add tests for read/writes from hugepages-backed buffers") > > > > > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org> > > > > > > > > --- > > > > > > > > ltp/fsx.c | 1 + > > > > > > > > tests/generic/759 | 3 +++ > > > > > > > > tests/generic/760 | 3 +++ > > > > > > > > 3 files changed, 7 insertions(+) > > > > > > > > > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c > > > > > > > > index 634c496ffe9317..cf9502a74c17a7 100644 > > > > > > > > --- a/ltp/fsx.c > > > > > > > > +++ b/ltp/fsx.c > > > > > > > > @@ -20,6 +20,7 @@ > > > > > > > > #include <strings.h> > > > > > > > > #include <sys/file.h> > > > > > > > > #include <sys/mman.h> > > > > > > > > +#include <linux/mman.h> > > > > > > > > #include <sys/uio.h> > > > > > > > > #include <stdbool.h> > > > > > > > > #ifdef HAVE_ERR_H > > > > > > > > diff --git a/tests/generic/759 b/tests/generic/759 > > > > > > > > index 6c74478aa8a0e0..3549c5ed6a9299 100755 > > > > > > > > --- a/tests/generic/759 > > > > > > > > +++ b/tests/generic/759 > > > > > > > > @@ -14,6 +14,9 @@ _begin_fstest rw auto quick > > > > > > > > _require_test > > > > > > > > _require_thp > > > > > > > > > > > > > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \ > > > > > > > > + _notrun "fsx binary does not support MADV_COLLAPSE" > > > > > > > > + > > > > > > > > run_fsx -N 10000 -l 500000 -h > > > > > > > > run_fsx -N 10000 -o 8192 -l 500000 -h > > > > > > > > run_fsx -N 10000 -o 128000 -l 500000 -h > > > > > > > > diff --git a/tests/generic/760 b/tests/generic/760 > > > > > > > > index c71a196222ad3b..2fbd28502ae678 100755 > > > > > > > > --- a/tests/generic/760 > > > > > > > > +++ b/tests/generic/760 > > > > > > > > @@ -15,6 +15,9 @@ _require_test > > > > > > > > _require_odirect > > > > > > > > _require_thp > > > > > > > > > > > > > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \ > > > > > > > > + _notrun "fsx binary does not support MADV_COLLAPSE" > > > > > > > > + > > > > > > > > > > > > > > I tried this out locally and it works for me: > > > > > > > > > > > > > > generic/759 8s ... [not run] fsx binary does not support MADV_COLLAPSE > > > > > > > Ran: generic/759 > > > > > > > Not run: generic/759 > > > > > > > Passed all 1 tests > > > > > > > > > > > > > > SECTION -- fuse > > > > > > > ========================= > > > > > > > Ran: generic/759 > > > > > > > Not run: generic/759 > > > > > > > Passed all 1 tests > > > > > > > > > > > > Does the test actually run if you /do/ have kernel/libc headers that > > > > > > define MADV_COLLAPSE? And if so, does that count as a Tested-by? > > > > > > > > > > > > > > > > I'm not sure if I fully understand the subtext of your question but > > > > > yes, the test runs on my system when MADV_COLLAPSE is defined in the > > > > > kernel/libc headers. > > > > > > > > Yep, you understood me correctly. :) > > > > > > > > > For sanity checking the inverse case, (eg MADV_COLLAPSE not defined), > > > > > I tried this out by modifying the ifdef/ifndef checks in fsx to > > > > > MADV_COLLAPSE_ to see if the '$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | > > > > > grep -q 'MADV_COLLAPSE not supported'' line catches that. > > > > > > > > <nod> Sounds good then; I'll add this to my test clod and run it > > > > overnight. > > > > > > The arm64 vms with 64k base pages spat out this: > > > > > > --- /run/fstests/bin/tests/generic/759.out 2025-02-02 08:36:28.007248055 -0800 > > > +++ /var/tmp/fstests/generic/759.out.bad 2025-02-03 16:51:34.862964640 -0800 > > > @@ -1,4 +1,5 @@ > > > QA output created by 759 > > > fsx -N 10000 -l 500000 -h > > > -fsx -N 10000 -o 8192 -l 500000 -h > > > -fsx -N 10000 -o 128000 -l 500000 -h > > > +Seed set to 1 > > > +madvise collapse for buf: Cannot allocate memory > > > +init_hugepages_buf failed for good_buf: Cannot allocate memory > > > > > > Note that it was trying to create a 512M page(!) on a VM with 8G of > > > memory. Normally one doesn't use large base page size in low memory > > > environments, but this /is/ fstestsland. 8-) > > > > > > From commit 7d8faaf155454f ("mm/madvise: introduce MADV_COLLAPSE sync > > > hugepage collapse") : > > > > > > ENOMEM Memory allocation failed or VMA not found > > > EBUSY Memcg charging failed > > > EAGAIN Required resource temporarily unavailable. Try again > > > might succeed. > > > EINVAL Other error: No PMD found, subpage doesn't have Present > > > bit set, "Special" page no backed by struct page, VMA > > > incorrectly sized, address not page-aligned, ... > > > > > > It sounds like ENOMEM/EBUSY/EINVAL should result in > > > _notrun "Could not generate hugepage" ? What are your thoughts? > > > > > > > Thanks for running it overnight. This sounds good to me, but will this > > be robust against false negatives? For example, if it succeeds when > > we're doing the > > > > $here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not > > supported' && _notrun "fsx binary does not support MADV_COLLAPSE" > > > > check, does that guarantee that the actual run won't error out with > > ENOMEM/EBUSY/EINVAL? > > Nope. That code only checks that the fsx binary was built with > MADV_COLLAPSE support, not that any of it actually works. > > > It seems like there could be the case where it > > passes the check but then when it actually runs, the system state > > memory state may have changed and now memcg charging or the memory > > allocation fails? > > Indeed, I think the error I saw is exactly this happening. > > In the end I turned the fsx -h invocation into a helper that checks for > any of those three errors (ENOMEM/BUSY/INVAL) and _notruns the test if > we can't even get a hugepage at the start. > > # Run fsx with -h(ugepage buffers). If we can't set up a hugepage then skip > # the test, but if any other error occurs then exit the test. > _run_hugepage_fsx() { > _run_fsx "$@" -h &> $tmp.hugepage_fsx > local res=$? > if [ $res -eq 103 ]; then > # According to the MADV_COLLAPSE manpage, these three errors > # can happen if the kernel could not collapse a collection of > # pages into a single huge page. > grep -q -E ' for hugebuf: (Cannot allocate memory|Device or resource busy|Resource temporarily unavailable)' $tmp.hugepage_fsx && \ > _notrun "Could not set up huge page for test" > fi > cat $tmp.hugepage_fsx > rm -f $tmp.hugepage_fsx > test $res -ne 0 && exit 1 > return 0 > } Awesome, thank you! > > Note that it won't prevent or paper over subsequent MADV_COLLAPSE > failures once the process is up and running, though as long as nothing > else in fsx fails or corrupts the file, the collapse failures won't be > reported. (As is the case now). > > Anyway I'll send patches for both issues. > > > allocation fails? EAGAIN seems a bit iffy to me - hopefully this > > doesn't happen often but if it does, it would likely be a false > > negative fail for the generic test? > > > > Maybe instead of "$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q > > 'MADV_COLLAPSE not supported' && _notrun "fsx binary does not support > > MADV_COLLAPSE"", we should run the test and then if the output to the > > .out file is "MADV_COLLAPSE not supported", then we deem that a > > pass/success? Not sure if this is possible in the fstest > > infrastructure though for checking against two possible outputs in the > > .out file. What are your thoughts? > > You ... can switch the .out(put) dynamically, but the mechanism to do it > is underdocumented and quirky; and it makes understanding what the test > does rather difficult. See _link_out_file in xfs/614 if you want to > open that Pandora's box. > > --D > > > > > > > Thanks, > > Joanne > > > > > --D > > > > > > > --D > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > Thanks, > > > > > > > Joanne > > > > > > > > > > > > > > > psize=`$here/src/feature -s` > > > > > > > > bsize=`$here/src/min_dio_alignment $TEST_DIR $TEST_DEV` > > > > > > > > > > > > > > > > > > > > >
diff --git a/ltp/fsx.c b/ltp/fsx.c index 41933354..3be383c6 100644 --- a/ltp/fsx.c +++ b/ltp/fsx.c @@ -190,6 +190,16 @@ int o_direct; /* -Z */ int aio = 0; int uring = 0; int mark_nr = 0; +int hugepages = 0; /* -h flag */ + +/* Stores info needed to periodically collapse hugepages */ +struct hugepages_collapse_info { + void *orig_good_buf; + long good_buf_size; + void *orig_temp_buf; + long temp_buf_size; +}; +struct hugepages_collapse_info hugepages_info; int page_size; int page_mask; @@ -2471,7 +2481,7 @@ void usage(void) { fprintf(stdout, "usage: %s", - "fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\ + "fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\ [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\ [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\ [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\ @@ -2483,8 +2493,11 @@ usage(void) -d: debug output for all operations\n\ -e: pollute post-eof on size changes (default 0)\n\ -f: flush and invalidate cache after I/O\n\ - -g X: write character X instead of random generated data\n\ - -i logdev: do integrity testing, logdev is the dm log writes device\n\ + -g X: write character X instead of random generated data\n" +#ifdef MADV_COLLAPSE +" -h hugepages: use buffers backed by hugepages for reads/writes\n" +#endif +" -i logdev: do integrity testing, logdev is the dm log writes device\n\ -j logid: prefix debug log messsages with this id\n\ -k: do not truncate existing file and use its size as upper bound on file size\n\ -l flen: the upper bound on file size (default 262144)\n\ @@ -2833,11 +2846,41 @@ __test_fallocate(int mode, const char *mode_str) #endif } +/* + * Reclaim may break up hugepages, so do a best-effort collapse every once in + * a while. + */ +static void +collapse_hugepages(void) +{ +#ifdef MADV_COLLAPSE + int ret; + + /* re-collapse every 16k fsxops after we start */ + if (!numops || (numops & ((1U << 14) - 1))) + return; + + ret = madvise(hugepages_info.orig_good_buf, + hugepages_info.good_buf_size, MADV_COLLAPSE); + if (ret) + prt("collapsing hugepages for good_buf failed (numops=%llu): %s\n", + numops, strerror(errno)); + ret = madvise(hugepages_info.orig_temp_buf, + hugepages_info.temp_buf_size, MADV_COLLAPSE); + if (ret) + prt("collapsing hugepages for temp_buf failed (numops=%llu): %s\n", + numops, strerror(errno)); +#endif +} + bool keep_running(void) { int ret; + if (hugepages) + collapse_hugepages(); + if (deadline.tv_nsec) { struct timespec now; @@ -2856,6 +2899,103 @@ keep_running(void) return numops-- != 0; } +static long +get_hugepage_size(void) +{ + const char str[] = "Hugepagesize:"; + size_t str_len = sizeof(str) - 1; + unsigned int hugepage_size = 0; + char buffer[64]; + FILE *file; + + file = fopen("/proc/meminfo", "r"); + if (!file) { + prterr("get_hugepage_size: fopen /proc/meminfo"); + return -1; + } + while (fgets(buffer, sizeof(buffer), file)) { + if (strncmp(buffer, str, str_len) == 0) { + sscanf(buffer + str_len, "%u", &hugepage_size); + break; + } + } + fclose(file); + if (!hugepage_size) { + prterr("get_hugepage_size: failed to find " + "hugepage size in /proc/meminfo\n"); + return -1; + } + + /* convert from KiB to bytes */ + return hugepage_size << 10; +} + +static void * +init_hugepages_buf(unsigned len, int hugepage_size, int alignment, long *buf_size) +{ + void *buf = NULL; +#ifdef MADV_COLLAPSE + int ret; + long size = roundup(len, hugepage_size) + alignment; + + ret = posix_memalign(&buf, hugepage_size, size); + if (ret) { + prterr("posix_memalign for buf"); + return NULL; + } + memset(buf, '\0', size); + ret = madvise(buf, size, MADV_COLLAPSE); + if (ret) { + prterr("madvise collapse for buf"); + free(buf); + return NULL; + } + + *buf_size = size; +#endif + return buf; +} + +static void +init_buffers(void) +{ + int i; + + original_buf = (char *) malloc(maxfilelen); + for (i = 0; i < maxfilelen; i++) + original_buf[i] = random() % 256; + if (hugepages) { + long hugepage_size = get_hugepage_size(); + if (hugepage_size == -1) { + prterr("get_hugepage_size()"); + exit(102); + } + good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy, + &hugepages_info.good_buf_size); + if (!good_buf) { + prterr("init_hugepages_buf failed for good_buf"); + exit(103); + } + hugepages_info.orig_good_buf = good_buf; + + temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy, + &hugepages_info.temp_buf_size); + if (!temp_buf) { + prterr("init_hugepages_buf failed for temp_buf"); + exit(103); + } + hugepages_info.orig_temp_buf = temp_buf; + } else { + unsigned long good_buf_len = maxfilelen + writebdy; + unsigned long temp_buf_len = maxoplen + readbdy; + + good_buf = calloc(1, good_buf_len); + temp_buf = calloc(1, temp_buf_len); + } + good_buf = round_ptr_up(good_buf, writebdy, 0); + temp_buf = round_ptr_up(temp_buf, readbdy, 0); +} + static struct option longopts[] = { {"replay-ops", required_argument, 0, 256}, {"record-ops", optional_argument, 0, 255}, @@ -2883,7 +3023,7 @@ main(int argc, char **argv) setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */ while ((ch = getopt_long(argc, argv, - "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", + "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ", longopts, NULL)) != EOF) switch (ch) { case 'b': @@ -2916,6 +3056,14 @@ main(int argc, char **argv) case 'g': filldata = *optarg; break; + case 'h': +#ifndef MADV_COLLAPSE + fprintf(stderr, "MADV_COLLAPSE not supported. " + "Can't support -h\n"); + exit(86); +#endif + hugepages = 1; + break; case 'i': integrity = 1; logdev = strdup(optarg); @@ -3229,15 +3377,7 @@ main(int argc, char **argv) exit(95); } } - original_buf = (char *) malloc(maxfilelen); - for (i = 0; i < maxfilelen; i++) - original_buf[i] = random() % 256; - good_buf = (char *) malloc(maxfilelen + writebdy); - good_buf = round_ptr_up(good_buf, writebdy, 0); - memset(good_buf, '\0', maxfilelen); - temp_buf = (char *) malloc(maxoplen + readbdy); - temp_buf = round_ptr_up(temp_buf, readbdy, 0); - memset(temp_buf, '\0', maxoplen); + init_buffers(); if (lite) { /* zero entire existing file */ ssize_t written;