diff mbox series

[v5,1/2] fsx: support reads/writes from buffers backed by hugepages

Message ID 20250121215641.1764359-2-joannelkoong@gmail.com (mailing list archive)
State New
Headers show
Series fstests: test reads/writes from hugepages-backed buffers | expand

Commit Message

Joanne Koong Jan. 21, 2025, 9:56 p.m. UTC
Add support for reads/writes from buffers backed by hugepages.
This can be enabled through the '-h' flag. This flag should only be used
on systems where THP capabilities are enabled.

This is motivated by a recent bug that was due to faulty handling of
userspace buffers backed by hugepages. This patch is a mitigation
against problems like this in the future.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
 ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 153 insertions(+), 13 deletions(-)

Comments

Zorro Lang Feb. 2, 2025, 2:25 p.m. UTC | #1
On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote:
> Add support for reads/writes from buffers backed by hugepages.
> This can be enabled through the '-h' flag. This flag should only be used
> on systems where THP capabilities are enabled.
> 
> This is motivated by a recent bug that was due to faulty handling of
> userspace buffers backed by hugepages. This patch is a mitigation
> against problems like this in the future.
> 
> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> Reviewed-by: Brian Foster <bfoster@redhat.com>
> Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> ---

Those two test cases fail on old system which doesn't support
MADV_COLLAPSE:

   fsx -N 10000 -l 500000 -h
  -fsx -N 10000 -o 8192 -l 500000 -h
  -fsx -N 10000 -o 128000 -l 500000 -h
  +MADV_COLLAPSE not supported. Can't support -h

and

   fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
  -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
  -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
  +mapped writes DISABLED
  +MADV_COLLAPSE not supported. Can't support -h

I'm wondering ...

>  ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 153 insertions(+), 13 deletions(-)
> 
> diff --git a/ltp/fsx.c b/ltp/fsx.c
> index 41933354..3be383c6 100644
> --- a/ltp/fsx.c
> +++ b/ltp/fsx.c
> @@ -190,6 +190,16 @@ int	o_direct;			/* -Z */
>  int	aio = 0;
>  int	uring = 0;
>  int	mark_nr = 0;
> +int	hugepages = 0;                  /* -h flag */
> +
> +/* Stores info needed to periodically collapse hugepages */
> +struct hugepages_collapse_info {
> +	void *orig_good_buf;
> +	long good_buf_size;
> +	void *orig_temp_buf;
> +	long temp_buf_size;
> +};
> +struct hugepages_collapse_info hugepages_info;
>  
>  int page_size;
>  int page_mask;
> @@ -2471,7 +2481,7 @@ void
>  usage(void)
>  {
>  	fprintf(stdout, "usage: %s",
> -		"fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\
> +		"fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\
>  	   [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\
>  	   [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\
>  	   [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\
> @@ -2483,8 +2493,11 @@ usage(void)
>  	-d: debug output for all operations\n\
>  	-e: pollute post-eof on size changes (default 0)\n\
>  	-f: flush and invalidate cache after I/O\n\
> -	-g X: write character X instead of random generated data\n\
> -	-i logdev: do integrity testing, logdev is the dm log writes device\n\
> +	-g X: write character X instead of random generated data\n"
> +#ifdef MADV_COLLAPSE
> +"	-h hugepages: use buffers backed by hugepages for reads/writes\n"
> +#endif
> +"	-i logdev: do integrity testing, logdev is the dm log writes device\n\
>  	-j logid: prefix debug log messsages with this id\n\
>  	-k: do not truncate existing file and use its size as upper bound on file size\n\
>  	-l flen: the upper bound on file size (default 262144)\n\
> @@ -2833,11 +2846,41 @@ __test_fallocate(int mode, const char *mode_str)
>  #endif
>  }
>  
> +/*
> + * Reclaim may break up hugepages, so do a best-effort collapse every once in
> + * a while.
> + */
> +static void
> +collapse_hugepages(void)
> +{
> +#ifdef MADV_COLLAPSE
> +	int ret;
> +
> +	/* re-collapse every 16k fsxops after we start */
> +	if (!numops || (numops & ((1U << 14) - 1)))
> +		return;
> +
> +	ret = madvise(hugepages_info.orig_good_buf,
> +		      hugepages_info.good_buf_size, MADV_COLLAPSE);
> +	if (ret)
> +		prt("collapsing hugepages for good_buf failed (numops=%llu): %s\n",
> +		     numops, strerror(errno));
> +	ret = madvise(hugepages_info.orig_temp_buf,
> +		      hugepages_info.temp_buf_size, MADV_COLLAPSE);
> +	if (ret)
> +		prt("collapsing hugepages for temp_buf failed (numops=%llu): %s\n",
> +		     numops, strerror(errno));
> +#endif
> +}
> +
>  bool
>  keep_running(void)
>  {
>  	int ret;
>  
> +	if (hugepages)
> +	        collapse_hugepages();
> +
>  	if (deadline.tv_nsec) {
>  		struct timespec now;
>  
> @@ -2856,6 +2899,103 @@ keep_running(void)
>  	return numops-- != 0;
>  }
>  
> +static long
> +get_hugepage_size(void)
> +{
> +	const char str[] = "Hugepagesize:";
> +	size_t str_len =  sizeof(str) - 1;
> +	unsigned int hugepage_size = 0;
> +	char buffer[64];
> +	FILE *file;
> +
> +	file = fopen("/proc/meminfo", "r");
> +	if (!file) {
> +		prterr("get_hugepage_size: fopen /proc/meminfo");
> +		return -1;
> +	}
> +	while (fgets(buffer, sizeof(buffer), file)) {
> +		if (strncmp(buffer, str, str_len) == 0) {
> +			sscanf(buffer + str_len, "%u", &hugepage_size);
> +			break;
> +		}
> +	}
> +	fclose(file);
> +	if (!hugepage_size) {
> +		prterr("get_hugepage_size: failed to find "
> +			"hugepage size in /proc/meminfo\n");
> +		return -1;
> +	}
> +
> +	/* convert from KiB to bytes */
> +	return hugepage_size << 10;
> +}
> +
> +static void *
> +init_hugepages_buf(unsigned len, int hugepage_size, int alignment, long *buf_size)
> +{
> +	void *buf = NULL;
> +#ifdef MADV_COLLAPSE
> +	int ret;
> +	long size = roundup(len, hugepage_size) + alignment;
> +
> +	ret = posix_memalign(&buf, hugepage_size, size);
> +	if (ret) {
> +		prterr("posix_memalign for buf");
> +		return NULL;
> +	}
> +	memset(buf, '\0', size);
> +	ret = madvise(buf, size, MADV_COLLAPSE);
> +	if (ret) {
> +		prterr("madvise collapse for buf");
> +		free(buf);
> +		return NULL;
> +	}
> +
> +	*buf_size = size;
> +#endif
> +	return buf;
> +}
> +
> +static void
> +init_buffers(void)
> +{
> +	int i;
> +
> +	original_buf = (char *) malloc(maxfilelen);
> +	for (i = 0; i < maxfilelen; i++)
> +		original_buf[i] = random() % 256;
> +	if (hugepages) {
> +		long hugepage_size = get_hugepage_size();
> +		if (hugepage_size == -1) {
> +			prterr("get_hugepage_size()");
> +			exit(102);
> +		}
> +		good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy,
> +					      &hugepages_info.good_buf_size);
> +		if (!good_buf) {
> +			prterr("init_hugepages_buf failed for good_buf");
> +			exit(103);
> +		}
> +		hugepages_info.orig_good_buf = good_buf;
> +
> +		temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy,
> +					      &hugepages_info.temp_buf_size);
> +		if (!temp_buf) {
> +			prterr("init_hugepages_buf failed for temp_buf");
> +			exit(103);
> +		}
> +		hugepages_info.orig_temp_buf = temp_buf;
> +	} else {
> +		unsigned long good_buf_len = maxfilelen + writebdy;
> +		unsigned long temp_buf_len = maxoplen + readbdy;
> +
> +		good_buf = calloc(1, good_buf_len);
> +		temp_buf = calloc(1, temp_buf_len);
> +	}
> +	good_buf = round_ptr_up(good_buf, writebdy, 0);
> +	temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> +}
> +
>  static struct option longopts[] = {
>  	{"replay-ops", required_argument, 0, 256},
>  	{"record-ops", optional_argument, 0, 255},
> @@ -2883,7 +3023,7 @@ main(int argc, char **argv)
>  	setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
>  
>  	while ((ch = getopt_long(argc, argv,
> -				 "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> +				 "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
>  				 longopts, NULL)) != EOF)
>  		switch (ch) {
>  		case 'b':
> @@ -2916,6 +3056,14 @@ main(int argc, char **argv)
>  		case 'g':
>  			filldata = *optarg;
>  			break;
> +		case 'h':
> +#ifndef MADV_COLLAPSE
> +			fprintf(stderr, "MADV_COLLAPSE not supported. "
> +				"Can't support -h\n");
> +			exit(86);
> +#endif
> +			hugepages = 1;
> +			break;

...
if we could change this part to:

#ifdef MADV_COLLAPSE
	hugepages = 1;
#endif
	break;

to avoid the test failures on old systems.

Or any better ideas from you :)

Thanks,
Zorro

>  		case 'i':
>  			integrity = 1;
>  			logdev = strdup(optarg);
> @@ -3229,15 +3377,7 @@ main(int argc, char **argv)
>  			exit(95);
>  		}
>  	}
> -	original_buf = (char *) malloc(maxfilelen);
> -	for (i = 0; i < maxfilelen; i++)
> -		original_buf[i] = random() % 256;
> -	good_buf = (char *) malloc(maxfilelen + writebdy);
> -	good_buf = round_ptr_up(good_buf, writebdy, 0);
> -	memset(good_buf, '\0', maxfilelen);
> -	temp_buf = (char *) malloc(maxoplen + readbdy);
> -	temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> -	memset(temp_buf, '\0', maxoplen);
> +	init_buffers();
>  	if (lite) {	/* zero entire existing file */
>  		ssize_t written;
>  
> -- 
> 2.47.1
>
Joanne Koong Feb. 3, 2025, 6:04 p.m. UTC | #2
On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote:
>
> On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote:
> > Add support for reads/writes from buffers backed by hugepages.
> > This can be enabled through the '-h' flag. This flag should only be used
> > on systems where THP capabilities are enabled.
> >
> > This is motivated by a recent bug that was due to faulty handling of
> > userspace buffers backed by hugepages. This patch is a mitigation
> > against problems like this in the future.
> >
> > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > ---
>
> Those two test cases fail on old system which doesn't support
> MADV_COLLAPSE:
>
>    fsx -N 10000 -l 500000 -h
>   -fsx -N 10000 -o 8192 -l 500000 -h
>   -fsx -N 10000 -o 128000 -l 500000 -h
>   +MADV_COLLAPSE not supported. Can't support -h
>
> and
>
>    fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
>   -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
>   -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
>   +mapped writes DISABLED
>   +MADV_COLLAPSE not supported. Can't support -h
>
> I'm wondering ...
>
> >  ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> >  1 file changed, 153 insertions(+), 13 deletions(-)
> >
> > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > index 41933354..3be383c6 100644
> > --- a/ltp/fsx.c
> > +++ b/ltp/fsx.c
> > @@ -190,6 +190,16 @@ int      o_direct;                       /* -Z */
> >  int  aio = 0;
> >  int  uring = 0;
> >  int  mark_nr = 0;
> > +int  hugepages = 0;                  /* -h flag */
> > +
> > +/* Stores info needed to periodically collapse hugepages */
> > +struct hugepages_collapse_info {
> > +     void *orig_good_buf;
> > +     long good_buf_size;
> > +     void *orig_temp_buf;
> > +     long temp_buf_size;
> > +};
> > +struct hugepages_collapse_info hugepages_info;
> >
> >  int page_size;
> >  int page_mask;
> > @@ -2471,7 +2481,7 @@ void
> >  usage(void)
> >  {
> >       fprintf(stdout, "usage: %s",
> > -             "fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\
> > +             "fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\
> >          [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\
> >          [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\
> >          [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\
> > @@ -2483,8 +2493,11 @@ usage(void)
> >       -d: debug output for all operations\n\
> >       -e: pollute post-eof on size changes (default 0)\n\
> >       -f: flush and invalidate cache after I/O\n\
> > -     -g X: write character X instead of random generated data\n\
> > -     -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > +     -g X: write character X instead of random generated data\n"
> > +#ifdef MADV_COLLAPSE
> > +"    -h hugepages: use buffers backed by hugepages for reads/writes\n"
> > +#endif
> > +"    -i logdev: do integrity testing, logdev is the dm log writes device\n\
> >       -j logid: prefix debug log messsages with this id\n\
> >       -k: do not truncate existing file and use its size as upper bound on file size\n\
> >       -l flen: the upper bound on file size (default 262144)\n\
> > @@ -2833,11 +2846,41 @@ __test_fallocate(int mode, const char *mode_str)
> >  #endif
> >  }
> >
> > +/*
> > + * Reclaim may break up hugepages, so do a best-effort collapse every once in
> > + * a while.
> > + */
> > +static void
> > +collapse_hugepages(void)
> > +{
> > +#ifdef MADV_COLLAPSE
> > +     int ret;
> > +
> > +     /* re-collapse every 16k fsxops after we start */
> > +     if (!numops || (numops & ((1U << 14) - 1)))
> > +             return;
> > +
> > +     ret = madvise(hugepages_info.orig_good_buf,
> > +                   hugepages_info.good_buf_size, MADV_COLLAPSE);
> > +     if (ret)
> > +             prt("collapsing hugepages for good_buf failed (numops=%llu): %s\n",
> > +                  numops, strerror(errno));
> > +     ret = madvise(hugepages_info.orig_temp_buf,
> > +                   hugepages_info.temp_buf_size, MADV_COLLAPSE);
> > +     if (ret)
> > +             prt("collapsing hugepages for temp_buf failed (numops=%llu): %s\n",
> > +                  numops, strerror(errno));
> > +#endif
> > +}
> > +
> >  bool
> >  keep_running(void)
> >  {
> >       int ret;
> >
> > +     if (hugepages)
> > +             collapse_hugepages();
> > +
> >       if (deadline.tv_nsec) {
> >               struct timespec now;
> >
> > @@ -2856,6 +2899,103 @@ keep_running(void)
> >       return numops-- != 0;
> >  }
> >
> > +static long
> > +get_hugepage_size(void)
> > +{
> > +     const char str[] = "Hugepagesize:";
> > +     size_t str_len =  sizeof(str) - 1;
> > +     unsigned int hugepage_size = 0;
> > +     char buffer[64];
> > +     FILE *file;
> > +
> > +     file = fopen("/proc/meminfo", "r");
> > +     if (!file) {
> > +             prterr("get_hugepage_size: fopen /proc/meminfo");
> > +             return -1;
> > +     }
> > +     while (fgets(buffer, sizeof(buffer), file)) {
> > +             if (strncmp(buffer, str, str_len) == 0) {
> > +                     sscanf(buffer + str_len, "%u", &hugepage_size);
> > +                     break;
> > +             }
> > +     }
> > +     fclose(file);
> > +     if (!hugepage_size) {
> > +             prterr("get_hugepage_size: failed to find "
> > +                     "hugepage size in /proc/meminfo\n");
> > +             return -1;
> > +     }
> > +
> > +     /* convert from KiB to bytes */
> > +     return hugepage_size << 10;
> > +}
> > +
> > +static void *
> > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment, long *buf_size)
> > +{
> > +     void *buf = NULL;
> > +#ifdef MADV_COLLAPSE
> > +     int ret;
> > +     long size = roundup(len, hugepage_size) + alignment;
> > +
> > +     ret = posix_memalign(&buf, hugepage_size, size);
> > +     if (ret) {
> > +             prterr("posix_memalign for buf");
> > +             return NULL;
> > +     }
> > +     memset(buf, '\0', size);
> > +     ret = madvise(buf, size, MADV_COLLAPSE);
> > +     if (ret) {
> > +             prterr("madvise collapse for buf");
> > +             free(buf);
> > +             return NULL;
> > +     }
> > +
> > +     *buf_size = size;
> > +#endif
> > +     return buf;
> > +}
> > +
> > +static void
> > +init_buffers(void)
> > +{
> > +     int i;
> > +
> > +     original_buf = (char *) malloc(maxfilelen);
> > +     for (i = 0; i < maxfilelen; i++)
> > +             original_buf[i] = random() % 256;
> > +     if (hugepages) {
> > +             long hugepage_size = get_hugepage_size();
> > +             if (hugepage_size == -1) {
> > +                     prterr("get_hugepage_size()");
> > +                     exit(102);
> > +             }
> > +             good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy,
> > +                                           &hugepages_info.good_buf_size);
> > +             if (!good_buf) {
> > +                     prterr("init_hugepages_buf failed for good_buf");
> > +                     exit(103);
> > +             }
> > +             hugepages_info.orig_good_buf = good_buf;
> > +
> > +             temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy,
> > +                                           &hugepages_info.temp_buf_size);
> > +             if (!temp_buf) {
> > +                     prterr("init_hugepages_buf failed for temp_buf");
> > +                     exit(103);
> > +             }
> > +             hugepages_info.orig_temp_buf = temp_buf;
> > +     } else {
> > +             unsigned long good_buf_len = maxfilelen + writebdy;
> > +             unsigned long temp_buf_len = maxoplen + readbdy;
> > +
> > +             good_buf = calloc(1, good_buf_len);
> > +             temp_buf = calloc(1, temp_buf_len);
> > +     }
> > +     good_buf = round_ptr_up(good_buf, writebdy, 0);
> > +     temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> > +}
> > +
> >  static struct option longopts[] = {
> >       {"replay-ops", required_argument, 0, 256},
> >       {"record-ops", optional_argument, 0, 255},
> > @@ -2883,7 +3023,7 @@ main(int argc, char **argv)
> >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> >
> >       while ((ch = getopt_long(argc, argv,
> > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> >                                longopts, NULL)) != EOF)
> >               switch (ch) {
> >               case 'b':
> > @@ -2916,6 +3056,14 @@ main(int argc, char **argv)
> >               case 'g':
> >                       filldata = *optarg;
> >                       break;
> > +             case 'h':
> > +#ifndef MADV_COLLAPSE
> > +                     fprintf(stderr, "MADV_COLLAPSE not supported. "
> > +                             "Can't support -h\n");
> > +                     exit(86);
> > +#endif
> > +                     hugepages = 1;
> > +                     break;
>
> ...
> if we could change this part to:
>
> #ifdef MADV_COLLAPSE
>         hugepages = 1;
> #endif
>         break;
>
> to avoid the test failures on old systems.
>
> Or any better ideas from you :)

Hi Zorro,

It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What
do you think about skipping generic/758 and generic/759 if the kernel
version is older than 6.1? That to me seems more preferable than the
paste above, as the paste above would execute the test as if it did
test hugepages when in reality it didn't, which would be misleading.


Thanks,
Joanne

>
> Thanks,
> Zorro
>
> >               case 'i':
> >                       integrity = 1;
> >                       logdev = strdup(optarg);
> > @@ -3229,15 +3377,7 @@ main(int argc, char **argv)
> >                       exit(95);
> >               }
> >       }
> > -     original_buf = (char *) malloc(maxfilelen);
> > -     for (i = 0; i < maxfilelen; i++)
> > -             original_buf[i] = random() % 256;
> > -     good_buf = (char *) malloc(maxfilelen + writebdy);
> > -     good_buf = round_ptr_up(good_buf, writebdy, 0);
> > -     memset(good_buf, '\0', maxfilelen);
> > -     temp_buf = (char *) malloc(maxoplen + readbdy);
> > -     temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> > -     memset(temp_buf, '\0', maxoplen);
> > +     init_buffers();
> >       if (lite) {     /* zero entire existing file */
> >               ssize_t written;
> >
> > --
> > 2.47.1
> >
>
Darrick J. Wong Feb. 3, 2025, 6:59 p.m. UTC | #3
On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote:
> On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote:
> >
> > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote:
> > > Add support for reads/writes from buffers backed by hugepages.
> > > This can be enabled through the '-h' flag. This flag should only be used
> > > on systems where THP capabilities are enabled.
> > >
> > > This is motivated by a recent bug that was due to faulty handling of
> > > userspace buffers backed by hugepages. This patch is a mitigation
> > > against problems like this in the future.
> > >
> > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > > ---
> >
> > Those two test cases fail on old system which doesn't support
> > MADV_COLLAPSE:
> >
> >    fsx -N 10000 -l 500000 -h
> >   -fsx -N 10000 -o 8192 -l 500000 -h
> >   -fsx -N 10000 -o 128000 -l 500000 -h
> >   +MADV_COLLAPSE not supported. Can't support -h
> >
> > and
> >
> >    fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> >   -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> >   -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> >   +mapped writes DISABLED
> >   +MADV_COLLAPSE not supported. Can't support -h
> >
> > I'm wondering ...
> >
> > >  ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > >  1 file changed, 153 insertions(+), 13 deletions(-)
> > >
> > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > index 41933354..3be383c6 100644
> > > --- a/ltp/fsx.c
> > > +++ b/ltp/fsx.c
> > > @@ -190,6 +190,16 @@ int      o_direct;                       /* -Z */
> > >  int  aio = 0;
> > >  int  uring = 0;
> > >  int  mark_nr = 0;
> > > +int  hugepages = 0;                  /* -h flag */
> > > +
> > > +/* Stores info needed to periodically collapse hugepages */
> > > +struct hugepages_collapse_info {
> > > +     void *orig_good_buf;
> > > +     long good_buf_size;
> > > +     void *orig_temp_buf;
> > > +     long temp_buf_size;
> > > +};
> > > +struct hugepages_collapse_info hugepages_info;
> > >
> > >  int page_size;
> > >  int page_mask;
> > > @@ -2471,7 +2481,7 @@ void
> > >  usage(void)
> > >  {
> > >       fprintf(stdout, "usage: %s",
> > > -             "fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\
> > > +             "fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\
> > >          [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\
> > >          [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\
> > >          [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\
> > > @@ -2483,8 +2493,11 @@ usage(void)
> > >       -d: debug output for all operations\n\
> > >       -e: pollute post-eof on size changes (default 0)\n\
> > >       -f: flush and invalidate cache after I/O\n\
> > > -     -g X: write character X instead of random generated data\n\
> > > -     -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > > +     -g X: write character X instead of random generated data\n"
> > > +#ifdef MADV_COLLAPSE
> > > +"    -h hugepages: use buffers backed by hugepages for reads/writes\n"
> > > +#endif
> > > +"    -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > >       -j logid: prefix debug log messsages with this id\n\
> > >       -k: do not truncate existing file and use its size as upper bound on file size\n\
> > >       -l flen: the upper bound on file size (default 262144)\n\
> > > @@ -2833,11 +2846,41 @@ __test_fallocate(int mode, const char *mode_str)
> > >  #endif
> > >  }
> > >
> > > +/*
> > > + * Reclaim may break up hugepages, so do a best-effort collapse every once in
> > > + * a while.
> > > + */
> > > +static void
> > > +collapse_hugepages(void)
> > > +{
> > > +#ifdef MADV_COLLAPSE
> > > +     int ret;
> > > +
> > > +     /* re-collapse every 16k fsxops after we start */
> > > +     if (!numops || (numops & ((1U << 14) - 1)))
> > > +             return;
> > > +
> > > +     ret = madvise(hugepages_info.orig_good_buf,
> > > +                   hugepages_info.good_buf_size, MADV_COLLAPSE);
> > > +     if (ret)
> > > +             prt("collapsing hugepages for good_buf failed (numops=%llu): %s\n",
> > > +                  numops, strerror(errno));
> > > +     ret = madvise(hugepages_info.orig_temp_buf,
> > > +                   hugepages_info.temp_buf_size, MADV_COLLAPSE);
> > > +     if (ret)
> > > +             prt("collapsing hugepages for temp_buf failed (numops=%llu): %s\n",
> > > +                  numops, strerror(errno));
> > > +#endif
> > > +}
> > > +
> > >  bool
> > >  keep_running(void)
> > >  {
> > >       int ret;
> > >
> > > +     if (hugepages)
> > > +             collapse_hugepages();
> > > +
> > >       if (deadline.tv_nsec) {
> > >               struct timespec now;
> > >
> > > @@ -2856,6 +2899,103 @@ keep_running(void)
> > >       return numops-- != 0;
> > >  }
> > >
> > > +static long
> > > +get_hugepage_size(void)
> > > +{
> > > +     const char str[] = "Hugepagesize:";
> > > +     size_t str_len =  sizeof(str) - 1;
> > > +     unsigned int hugepage_size = 0;
> > > +     char buffer[64];
> > > +     FILE *file;
> > > +
> > > +     file = fopen("/proc/meminfo", "r");
> > > +     if (!file) {
> > > +             prterr("get_hugepage_size: fopen /proc/meminfo");
> > > +             return -1;
> > > +     }
> > > +     while (fgets(buffer, sizeof(buffer), file)) {
> > > +             if (strncmp(buffer, str, str_len) == 0) {
> > > +                     sscanf(buffer + str_len, "%u", &hugepage_size);
> > > +                     break;
> > > +             }
> > > +     }
> > > +     fclose(file);
> > > +     if (!hugepage_size) {
> > > +             prterr("get_hugepage_size: failed to find "
> > > +                     "hugepage size in /proc/meminfo\n");
> > > +             return -1;
> > > +     }
> > > +
> > > +     /* convert from KiB to bytes */
> > > +     return hugepage_size << 10;
> > > +}
> > > +
> > > +static void *
> > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment, long *buf_size)
> > > +{
> > > +     void *buf = NULL;
> > > +#ifdef MADV_COLLAPSE
> > > +     int ret;
> > > +     long size = roundup(len, hugepage_size) + alignment;
> > > +
> > > +     ret = posix_memalign(&buf, hugepage_size, size);
> > > +     if (ret) {
> > > +             prterr("posix_memalign for buf");
> > > +             return NULL;
> > > +     }
> > > +     memset(buf, '\0', size);
> > > +     ret = madvise(buf, size, MADV_COLLAPSE);
> > > +     if (ret) {
> > > +             prterr("madvise collapse for buf");
> > > +             free(buf);
> > > +             return NULL;
> > > +     }
> > > +
> > > +     *buf_size = size;
> > > +#endif
> > > +     return buf;
> > > +}
> > > +
> > > +static void
> > > +init_buffers(void)
> > > +{
> > > +     int i;
> > > +
> > > +     original_buf = (char *) malloc(maxfilelen);
> > > +     for (i = 0; i < maxfilelen; i++)
> > > +             original_buf[i] = random() % 256;
> > > +     if (hugepages) {
> > > +             long hugepage_size = get_hugepage_size();
> > > +             if (hugepage_size == -1) {
> > > +                     prterr("get_hugepage_size()");
> > > +                     exit(102);
> > > +             }
> > > +             good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy,
> > > +                                           &hugepages_info.good_buf_size);
> > > +             if (!good_buf) {
> > > +                     prterr("init_hugepages_buf failed for good_buf");
> > > +                     exit(103);
> > > +             }
> > > +             hugepages_info.orig_good_buf = good_buf;
> > > +
> > > +             temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy,
> > > +                                           &hugepages_info.temp_buf_size);
> > > +             if (!temp_buf) {
> > > +                     prterr("init_hugepages_buf failed for temp_buf");
> > > +                     exit(103);
> > > +             }
> > > +             hugepages_info.orig_temp_buf = temp_buf;
> > > +     } else {
> > > +             unsigned long good_buf_len = maxfilelen + writebdy;
> > > +             unsigned long temp_buf_len = maxoplen + readbdy;
> > > +
> > > +             good_buf = calloc(1, good_buf_len);
> > > +             temp_buf = calloc(1, temp_buf_len);
> > > +     }
> > > +     good_buf = round_ptr_up(good_buf, writebdy, 0);
> > > +     temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> > > +}
> > > +
> > >  static struct option longopts[] = {
> > >       {"replay-ops", required_argument, 0, 256},
> > >       {"record-ops", optional_argument, 0, 255},
> > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv)
> > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > >
> > >       while ((ch = getopt_long(argc, argv,
> > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > >                                longopts, NULL)) != EOF)
> > >               switch (ch) {
> > >               case 'b':
> > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv)
> > >               case 'g':
> > >                       filldata = *optarg;
> > >                       break;
> > > +             case 'h':
> > > +#ifndef MADV_COLLAPSE
> > > +                     fprintf(stderr, "MADV_COLLAPSE not supported. "
> > > +                             "Can't support -h\n");
> > > +                     exit(86);
> > > +#endif
> > > +                     hugepages = 1;
> > > +                     break;
> >
> > ...
> > if we could change this part to:
> >
> > #ifdef MADV_COLLAPSE
> >         hugepages = 1;
> > #endif
> >         break;
> >
> > to avoid the test failures on old systems.
> >
> > Or any better ideas from you :)
> 
> Hi Zorro,
> 
> It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What
> do you think about skipping generic/758 and generic/759 if the kernel
> version is older than 6.1? That to me seems more preferable than the
> paste above, as the paste above would execute the test as if it did
> test hugepages when in reality it didn't, which would be misleading.

Now that I've gotten to try this out --

There's a couple of things going on here.  The first is that generic/759
and 760 need to check if invoking fsx -h causes it to spit out the
"MADV_COLLAPSE not supported" error and _notrun the test.

The second thing is that userspace programs can ensure the existence of
MADV_COLLAPSE in multiple ways.  The first way is through sys/mman.h,
which requires that the underlying C library headers are new enough to
include a definition.  glibc 2.37 is new enough, but even things like
Debian 12 and RHEL 9 aren't new enough to have that.  Other C libraries
might not follow glibc's practice of wrapping and/or redefining symbols
in a way that you hope is the same as...

The second way is through linux/mman.h, which comes from the kernel
headers package; and the third way is for the program to define it
itself if nobody else does.

So I think the easiest way to fix the fsx.c build is to include
linux/mman.h in addition to sys/mman.h.  Sorry I didn't notice these
details when I reviewed your patch; I'm a little attention constrained
ATM trying to get a large pile of bugfixes and redesigns reviewed so
for-next can finally move forward again.

--D

> 
> Thanks,
> Joanne
> 
> >
> > Thanks,
> > Zorro
> >
> > >               case 'i':
> > >                       integrity = 1;
> > >                       logdev = strdup(optarg);
> > > @@ -3229,15 +3377,7 @@ main(int argc, char **argv)
> > >                       exit(95);
> > >               }
> > >       }
> > > -     original_buf = (char *) malloc(maxfilelen);
> > > -     for (i = 0; i < maxfilelen; i++)
> > > -             original_buf[i] = random() % 256;
> > > -     good_buf = (char *) malloc(maxfilelen + writebdy);
> > > -     good_buf = round_ptr_up(good_buf, writebdy, 0);
> > > -     memset(good_buf, '\0', maxfilelen);
> > > -     temp_buf = (char *) malloc(maxoplen + readbdy);
> > > -     temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> > > -     memset(temp_buf, '\0', maxoplen);
> > > +     init_buffers();
> > >       if (lite) {     /* zero entire existing file */
> > >               ssize_t written;
> > >
> > > --
> > > 2.47.1
> > >
> >
Joanne Koong Feb. 3, 2025, 7:23 p.m. UTC | #4
On Mon, Feb 3, 2025 at 10:59 AM Darrick J. Wong <djwong@kernel.org> wrote:
>
> On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote:
> > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote:
> > >
> > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote:
> > > > Add support for reads/writes from buffers backed by hugepages.
> > > > This can be enabled through the '-h' flag. This flag should only be used
> > > > on systems where THP capabilities are enabled.
> > > >
> > > > This is motivated by a recent bug that was due to faulty handling of
> > > > userspace buffers backed by hugepages. This patch is a mitigation
> > > > against problems like this in the future.
> > > >
> > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > > > ---
> > >
> > > Those two test cases fail on old system which doesn't support
> > > MADV_COLLAPSE:
> > >
> > >    fsx -N 10000 -l 500000 -h
> > >   -fsx -N 10000 -o 8192 -l 500000 -h
> > >   -fsx -N 10000 -o 128000 -l 500000 -h
> > >   +MADV_COLLAPSE not supported. Can't support -h
> > >
> > > and
> > >
> > >    fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > >   -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > >   -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > >   +mapped writes DISABLED
> > >   +MADV_COLLAPSE not supported. Can't support -h
> > >
> > > I'm wondering ...
> > >
> > > >  ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > > >  1 file changed, 153 insertions(+), 13 deletions(-)
> > > >
> > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > index 41933354..3be383c6 100644
> > > > --- a/ltp/fsx.c
> > > > +++ b/ltp/fsx.c
> > > >  static struct option longopts[] = {
> > > >       {"replay-ops", required_argument, 0, 256},
> > > >       {"record-ops", optional_argument, 0, 255},
> > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv)
> > > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > > >
> > > >       while ((ch = getopt_long(argc, argv,
> > > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > >                                longopts, NULL)) != EOF)
> > > >               switch (ch) {
> > > >               case 'b':
> > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv)
> > > >               case 'g':
> > > >                       filldata = *optarg;
> > > >                       break;
> > > > +             case 'h':
> > > > +#ifndef MADV_COLLAPSE
> > > > +                     fprintf(stderr, "MADV_COLLAPSE not supported. "
> > > > +                             "Can't support -h\n");
> > > > +                     exit(86);
> > > > +#endif
> > > > +                     hugepages = 1;
> > > > +                     break;
> > >
> > > ...
> > > if we could change this part to:
> > >
> > > #ifdef MADV_COLLAPSE
> > >         hugepages = 1;
> > > #endif
> > >         break;
> > >
> > > to avoid the test failures on old systems.
> > >
> > > Or any better ideas from you :)
> >
> > Hi Zorro,
> >
> > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What
> > do you think about skipping generic/758 and generic/759 if the kernel
> > version is older than 6.1? That to me seems more preferable than the
> > paste above, as the paste above would execute the test as if it did
> > test hugepages when in reality it didn't, which would be misleading.
>
> Now that I've gotten to try this out --
>
> There's a couple of things going on here.  The first is that generic/759
> and 760 need to check if invoking fsx -h causes it to spit out the
> "MADV_COLLAPSE not supported" error and _notrun the test.
>
> The second thing is that userspace programs can ensure the existence of
> MADV_COLLAPSE in multiple ways.  The first way is through sys/mman.h,
> which requires that the underlying C library headers are new enough to
> include a definition.  glibc 2.37 is new enough, but even things like
> Debian 12 and RHEL 9 aren't new enough to have that.  Other C libraries
> might not follow glibc's practice of wrapping and/or redefining symbols
> in a way that you hope is the same as...
>
> The second way is through linux/mman.h, which comes from the kernel
> headers package; and the third way is for the program to define it
> itself if nobody else does.
>
> So I think the easiest way to fix the fsx.c build is to include
> linux/mman.h in addition to sys/mman.h.  Sorry I didn't notice these

Thanks for your input. Do we still need sys/mman.h if linux/mman.h is added?

For generic/758 and 759, does it suffice to gate this on whether the
kernel version if 6.1+ and _notrun if not? My understanding is that
any kernel version 6.1 or newer will have MADV_COLLAPSE in its kernel
headers package and support the feature.


Thanks,
Joanne

> details when I reviewed your patch; I'm a little attention constrained
> ATM trying to get a large pile of bugfixes and redesigns reviewed so
> for-next can finally move forward again.
>
> --D
>
> >
> > Thanks,
> > Joanne
> >
> > >
> > > Thanks,
> > > Zorro
> > >
> > > >               case 'i':
> > > >                       integrity = 1;
> > > >                       logdev = strdup(optarg);
> > > > @@ -3229,15 +3377,7 @@ main(int argc, char **argv)
> > > >                       exit(95);
> > > >               }
> > > >       }
> > > > -     original_buf = (char *) malloc(maxfilelen);
> > > > -     for (i = 0; i < maxfilelen; i++)
> > > > -             original_buf[i] = random() % 256;
> > > > -     good_buf = (char *) malloc(maxfilelen + writebdy);
> > > > -     good_buf = round_ptr_up(good_buf, writebdy, 0);
> > > > -     memset(good_buf, '\0', maxfilelen);
> > > > -     temp_buf = (char *) malloc(maxoplen + readbdy);
> > > > -     temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> > > > -     memset(temp_buf, '\0', maxoplen);
> > > > +     init_buffers();
> > > >       if (lite) {     /* zero entire existing file */
> > > >               ssize_t written;
> > > >
> > > > --
> > > > 2.47.1
> > > >
> > >
Darrick J. Wong Feb. 3, 2025, 7:41 p.m. UTC | #5
On Mon, Feb 03, 2025 at 11:23:20AM -0800, Joanne Koong wrote:
> On Mon, Feb 3, 2025 at 10:59 AM Darrick J. Wong <djwong@kernel.org> wrote:
> >
> > On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote:
> > > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote:
> > > >
> > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote:
> > > > > Add support for reads/writes from buffers backed by hugepages.
> > > > > This can be enabled through the '-h' flag. This flag should only be used
> > > > > on systems where THP capabilities are enabled.
> > > > >
> > > > > This is motivated by a recent bug that was due to faulty handling of
> > > > > userspace buffers backed by hugepages. This patch is a mitigation
> > > > > against problems like this in the future.
> > > > >
> > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > > > > ---
> > > >
> > > > Those two test cases fail on old system which doesn't support
> > > > MADV_COLLAPSE:
> > > >
> > > >    fsx -N 10000 -l 500000 -h
> > > >   -fsx -N 10000 -o 8192 -l 500000 -h
> > > >   -fsx -N 10000 -o 128000 -l 500000 -h
> > > >   +MADV_COLLAPSE not supported. Can't support -h
> > > >
> > > > and
> > > >
> > > >    fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > >   -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > >   -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > >   +mapped writes DISABLED
> > > >   +MADV_COLLAPSE not supported. Can't support -h
> > > >
> > > > I'm wondering ...
> > > >
> > > > >  ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > > > >  1 file changed, 153 insertions(+), 13 deletions(-)
> > > > >
> > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > > index 41933354..3be383c6 100644
> > > > > --- a/ltp/fsx.c
> > > > > +++ b/ltp/fsx.c
> > > > >  static struct option longopts[] = {
> > > > >       {"replay-ops", required_argument, 0, 256},
> > > > >       {"record-ops", optional_argument, 0, 255},
> > > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv)
> > > > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > > > >
> > > > >       while ((ch = getopt_long(argc, argv,
> > > > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > >                                longopts, NULL)) != EOF)
> > > > >               switch (ch) {
> > > > >               case 'b':
> > > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv)
> > > > >               case 'g':
> > > > >                       filldata = *optarg;
> > > > >                       break;
> > > > > +             case 'h':
> > > > > +#ifndef MADV_COLLAPSE
> > > > > +                     fprintf(stderr, "MADV_COLLAPSE not supported. "
> > > > > +                             "Can't support -h\n");
> > > > > +                     exit(86);
> > > > > +#endif
> > > > > +                     hugepages = 1;
> > > > > +                     break;
> > > >
> > > > ...
> > > > if we could change this part to:
> > > >
> > > > #ifdef MADV_COLLAPSE
> > > >         hugepages = 1;
> > > > #endif
> > > >         break;
> > > >
> > > > to avoid the test failures on old systems.
> > > >
> > > > Or any better ideas from you :)
> > >
> > > Hi Zorro,
> > >
> > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What
> > > do you think about skipping generic/758 and generic/759 if the kernel
> > > version is older than 6.1? That to me seems more preferable than the
> > > paste above, as the paste above would execute the test as if it did
> > > test hugepages when in reality it didn't, which would be misleading.
> >
> > Now that I've gotten to try this out --
> >
> > There's a couple of things going on here.  The first is that generic/759
> > and 760 need to check if invoking fsx -h causes it to spit out the
> > "MADV_COLLAPSE not supported" error and _notrun the test.
> >
> > The second thing is that userspace programs can ensure the existence of
> > MADV_COLLAPSE in multiple ways.  The first way is through sys/mman.h,
> > which requires that the underlying C library headers are new enough to
> > include a definition.  glibc 2.37 is new enough, but even things like
> > Debian 12 and RHEL 9 aren't new enough to have that.  Other C libraries
> > might not follow glibc's practice of wrapping and/or redefining symbols
> > in a way that you hope is the same as...
> >
> > The second way is through linux/mman.h, which comes from the kernel
> > headers package; and the third way is for the program to define it
> > itself if nobody else does.
> >
> > So I think the easiest way to fix the fsx.c build is to include
> > linux/mman.h in addition to sys/mman.h.  Sorry I didn't notice these
> 
> Thanks for your input. Do we still need sys/mman.h if linux/mman.h is added?

Yes, because glibc provides the mmap() function that wraps
syscall(__NR_mmap, ...);

> For generic/758 and 759, does it suffice to gate this on whether the
> kernel version if 6.1+ and _notrun if not? My understanding is that
> any kernel version 6.1 or newer will have MADV_COLLAPSE in its kernel
> headers package and support the feature.

No, because some (most?) vendors backport new features into existing
kernels without revving the version number of that kernel.

Maybe the following fixes things?

--D

generic/759,760: fix MADV_COLLAPSE detection and inclusion

On systems with "old" C libraries such as glibc 2.36 in Debian 12, the
MADV_COLLAPSE flag might not be defined in any of the header files
pulled in by sys/mman.h, which means that the fsx binary might not get
built with any of the MADV_COLLAPSE code.  If the kernel supports THP,
the test will fail with:

>  QA output created by 760
>  fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> +mapped writes DISABLED
> +MADV_COLLAPSE not supported. Can't support -h

Fix both tests to detect fsx binaries that don't support MADV_COLLAPSE,
then fix fsx.c to include the mman.h from the kernel headers (aka
linux/mman.h) so that we can actually test IOs to and from THPs if the
kernel is newer than the rest of userspace.

Cc: <fstests@vger.kernel.org> # v2025.02.02
Fixes: 627289232371e3 ("generic: add tests for read/writes from hugepages-backed buffers")
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 ltp/fsx.c         |    1 +
 tests/generic/759 |    3 +++
 tests/generic/760 |    3 +++
 3 files changed, 7 insertions(+)

diff --git a/ltp/fsx.c b/ltp/fsx.c
index 634c496ffe9317..cf9502a74c17a7 100644
--- a/ltp/fsx.c
+++ b/ltp/fsx.c
@@ -20,6 +20,7 @@
 #include <strings.h>
 #include <sys/file.h>
 #include <sys/mman.h>
+#include <linux/mman.h>
 #include <sys/uio.h>
 #include <stdbool.h>
 #ifdef HAVE_ERR_H
diff --git a/tests/generic/759 b/tests/generic/759
index 6c74478aa8a0e0..3549c5ed6a9299 100755
--- a/tests/generic/759
+++ b/tests/generic/759
@@ -14,6 +14,9 @@ _begin_fstest rw auto quick
 _require_test
 _require_thp
 
+$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \
+	_notrun "fsx binary does not support MADV_COLLAPSE"
+
 run_fsx -N 10000            -l 500000 -h
 run_fsx -N 10000  -o 8192   -l 500000 -h
 run_fsx -N 10000  -o 128000 -l 500000 -h
diff --git a/tests/generic/760 b/tests/generic/760
index c71a196222ad3b..2fbd28502ae678 100755
--- a/tests/generic/760
+++ b/tests/generic/760
@@ -15,6 +15,9 @@ _require_test
 _require_odirect
 _require_thp
 
+$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \
+	_notrun "fsx binary does not support MADV_COLLAPSE"
+
 psize=`$here/src/feature -s`
 bsize=`$here/src/min_dio_alignment $TEST_DIR $TEST_DEV`
Joanne Koong Feb. 3, 2025, 7:57 p.m. UTC | #6
On Mon, Feb 3, 2025 at 11:41 AM Darrick J. Wong <djwong@kernel.org> wrote:
>
> On Mon, Feb 03, 2025 at 11:23:20AM -0800, Joanne Koong wrote:
> > On Mon, Feb 3, 2025 at 10:59 AM Darrick J. Wong <djwong@kernel.org> wrote:
> > >
> > > On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote:
> > > > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote:
> > > > >
> > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote:
> > > > > > Add support for reads/writes from buffers backed by hugepages.
> > > > > > This can be enabled through the '-h' flag. This flag should only be used
> > > > > > on systems where THP capabilities are enabled.
> > > > > >
> > > > > > This is motivated by a recent bug that was due to faulty handling of
> > > > > > userspace buffers backed by hugepages. This patch is a mitigation
> > > > > > against problems like this in the future.
> > > > > >
> > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > > > > > ---
> > > > >
> > > > > Those two test cases fail on old system which doesn't support
> > > > > MADV_COLLAPSE:
> > > > >
> > > > >    fsx -N 10000 -l 500000 -h
> > > > >   -fsx -N 10000 -o 8192 -l 500000 -h
> > > > >   -fsx -N 10000 -o 128000 -l 500000 -h
> > > > >   +MADV_COLLAPSE not supported. Can't support -h
> > > > >
> > > > > and
> > > > >
> > > > >    fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > >   -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > >   -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > >   +mapped writes DISABLED
> > > > >   +MADV_COLLAPSE not supported. Can't support -h
> > > > >
> > > > > I'm wondering ...
> > > > >
> > > > > >  ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > > > > >  1 file changed, 153 insertions(+), 13 deletions(-)
> > > > > >
> > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > > > index 41933354..3be383c6 100644
> > > > > > --- a/ltp/fsx.c
> > > > > > +++ b/ltp/fsx.c
> > > > > >  static struct option longopts[] = {
> > > > > >       {"replay-ops", required_argument, 0, 256},
> > > > > >       {"record-ops", optional_argument, 0, 255},
> > > > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv)
> > > > > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > > > > >
> > > > > >       while ((ch = getopt_long(argc, argv,
> > > > > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > >                                longopts, NULL)) != EOF)
> > > > > >               switch (ch) {
> > > > > >               case 'b':
> > > > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv)
> > > > > >               case 'g':
> > > > > >                       filldata = *optarg;
> > > > > >                       break;
> > > > > > +             case 'h':
> > > > > > +#ifndef MADV_COLLAPSE
> > > > > > +                     fprintf(stderr, "MADV_COLLAPSE not supported. "
> > > > > > +                             "Can't support -h\n");
> > > > > > +                     exit(86);
> > > > > > +#endif
> > > > > > +                     hugepages = 1;
> > > > > > +                     break;
> > > > >
> > > > > ...
> > > > > if we could change this part to:
> > > > >
> > > > > #ifdef MADV_COLLAPSE
> > > > >         hugepages = 1;
> > > > > #endif
> > > > >         break;
> > > > >
> > > > > to avoid the test failures on old systems.
> > > > >
> > > > > Or any better ideas from you :)
> > > >
> > > > Hi Zorro,
> > > >
> > > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What
> > > > do you think about skipping generic/758 and generic/759 if the kernel
> > > > version is older than 6.1? That to me seems more preferable than the
> > > > paste above, as the paste above would execute the test as if it did
> > > > test hugepages when in reality it didn't, which would be misleading.
> > >
> > > Now that I've gotten to try this out --
> > >
> > > There's a couple of things going on here.  The first is that generic/759
> > > and 760 need to check if invoking fsx -h causes it to spit out the
> > > "MADV_COLLAPSE not supported" error and _notrun the test.
> > >
> > > The second thing is that userspace programs can ensure the existence of
> > > MADV_COLLAPSE in multiple ways.  The first way is through sys/mman.h,
> > > which requires that the underlying C library headers are new enough to
> > > include a definition.  glibc 2.37 is new enough, but even things like
> > > Debian 12 and RHEL 9 aren't new enough to have that.  Other C libraries
> > > might not follow glibc's practice of wrapping and/or redefining symbols
> > > in a way that you hope is the same as...
> > >
> > > The second way is through linux/mman.h, which comes from the kernel
> > > headers package; and the third way is for the program to define it
> > > itself if nobody else does.
> > >
> > > So I think the easiest way to fix the fsx.c build is to include
> > > linux/mman.h in addition to sys/mman.h.  Sorry I didn't notice these
> >
> > Thanks for your input. Do we still need sys/mman.h if linux/mman.h is added?
>
> Yes, because glibc provides the mmap() function that wraps
> syscall(__NR_mmap, ...);
>
> > For generic/758 and 759, does it suffice to gate this on whether the
> > kernel version if 6.1+ and _notrun if not? My understanding is that
> > any kernel version 6.1 or newer will have MADV_COLLAPSE in its kernel
> > headers package and support the feature.
>
> No, because some (most?) vendors backport new features into existing
> kernels without revving the version number of that kernel.

Oh okay, I see. That makes sense, thanks for the explanation.

>
> Maybe the following fixes things?
>
> --D
>
> generic/759,760: fix MADV_COLLAPSE detection and inclusion
>
> On systems with "old" C libraries such as glibc 2.36 in Debian 12, the
> MADV_COLLAPSE flag might not be defined in any of the header files
> pulled in by sys/mman.h, which means that the fsx binary might not get
> built with any of the MADV_COLLAPSE code.  If the kernel supports THP,
> the test will fail with:
>
> >  QA output created by 760
> >  fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > +mapped writes DISABLED
> > +MADV_COLLAPSE not supported. Can't support -h
>
> Fix both tests to detect fsx binaries that don't support MADV_COLLAPSE,
> then fix fsx.c to include the mman.h from the kernel headers (aka
> linux/mman.h) so that we can actually test IOs to and from THPs if the
> kernel is newer than the rest of userspace.
>
> Cc: <fstests@vger.kernel.org> # v2025.02.02
> Fixes: 627289232371e3 ("generic: add tests for read/writes from hugepages-backed buffers")
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
>  ltp/fsx.c         |    1 +
>  tests/generic/759 |    3 +++
>  tests/generic/760 |    3 +++
>  3 files changed, 7 insertions(+)
>
> diff --git a/ltp/fsx.c b/ltp/fsx.c
> index 634c496ffe9317..cf9502a74c17a7 100644
> --- a/ltp/fsx.c
> +++ b/ltp/fsx.c
> @@ -20,6 +20,7 @@
>  #include <strings.h>
>  #include <sys/file.h>
>  #include <sys/mman.h>
> +#include <linux/mman.h>
>  #include <sys/uio.h>
>  #include <stdbool.h>
>  #ifdef HAVE_ERR_H
> diff --git a/tests/generic/759 b/tests/generic/759
> index 6c74478aa8a0e0..3549c5ed6a9299 100755
> --- a/tests/generic/759
> +++ b/tests/generic/759
> @@ -14,6 +14,9 @@ _begin_fstest rw auto quick
>  _require_test
>  _require_thp
>
> +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \
> +       _notrun "fsx binary does not support MADV_COLLAPSE"
> +
>  run_fsx -N 10000            -l 500000 -h
>  run_fsx -N 10000  -o 8192   -l 500000 -h
>  run_fsx -N 10000  -o 128000 -l 500000 -h
> diff --git a/tests/generic/760 b/tests/generic/760
> index c71a196222ad3b..2fbd28502ae678 100755
> --- a/tests/generic/760
> +++ b/tests/generic/760
> @@ -15,6 +15,9 @@ _require_test
>  _require_odirect
>  _require_thp
>
> +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \
> +       _notrun "fsx binary does not support MADV_COLLAPSE"
> +

I tried this out locally and it works for me:

generic/759 8s ... [not run] fsx binary does not support MADV_COLLAPSE
Ran: generic/759
Not run: generic/759
Passed all 1 tests

SECTION       -- fuse
=========================
Ran: generic/759
Not run: generic/759
Passed all 1 tests


Thanks,
Joanne

>  psize=`$here/src/feature -s`
>  bsize=`$here/src/min_dio_alignment $TEST_DIR $TEST_DEV`
>
Darrick J. Wong Feb. 3, 2025, 8:01 p.m. UTC | #7
On Mon, Feb 03, 2025 at 11:57:23AM -0800, Joanne Koong wrote:
> On Mon, Feb 3, 2025 at 11:41 AM Darrick J. Wong <djwong@kernel.org> wrote:
> >
> > On Mon, Feb 03, 2025 at 11:23:20AM -0800, Joanne Koong wrote:
> > > On Mon, Feb 3, 2025 at 10:59 AM Darrick J. Wong <djwong@kernel.org> wrote:
> > > >
> > > > On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote:
> > > > > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote:
> > > > > >
> > > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote:
> > > > > > > Add support for reads/writes from buffers backed by hugepages.
> > > > > > > This can be enabled through the '-h' flag. This flag should only be used
> > > > > > > on systems where THP capabilities are enabled.
> > > > > > >
> > > > > > > This is motivated by a recent bug that was due to faulty handling of
> > > > > > > userspace buffers backed by hugepages. This patch is a mitigation
> > > > > > > against problems like this in the future.
> > > > > > >
> > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > > > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > > > > > > ---
> > > > > >
> > > > > > Those two test cases fail on old system which doesn't support
> > > > > > MADV_COLLAPSE:
> > > > > >
> > > > > >    fsx -N 10000 -l 500000 -h
> > > > > >   -fsx -N 10000 -o 8192 -l 500000 -h
> > > > > >   -fsx -N 10000 -o 128000 -l 500000 -h
> > > > > >   +MADV_COLLAPSE not supported. Can't support -h
> > > > > >
> > > > > > and
> > > > > >
> > > > > >    fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > >   -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > >   -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > >   +mapped writes DISABLED
> > > > > >   +MADV_COLLAPSE not supported. Can't support -h
> > > > > >
> > > > > > I'm wondering ...
> > > > > >
> > > > > > >  ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > > > > > >  1 file changed, 153 insertions(+), 13 deletions(-)
> > > > > > >
> > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > > > > index 41933354..3be383c6 100644
> > > > > > > --- a/ltp/fsx.c
> > > > > > > +++ b/ltp/fsx.c
> > > > > > >  static struct option longopts[] = {
> > > > > > >       {"replay-ops", required_argument, 0, 256},
> > > > > > >       {"record-ops", optional_argument, 0, 255},
> > > > > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv)
> > > > > > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > > > > > >
> > > > > > >       while ((ch = getopt_long(argc, argv,
> > > > > > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > >                                longopts, NULL)) != EOF)
> > > > > > >               switch (ch) {
> > > > > > >               case 'b':
> > > > > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv)
> > > > > > >               case 'g':
> > > > > > >                       filldata = *optarg;
> > > > > > >                       break;
> > > > > > > +             case 'h':
> > > > > > > +#ifndef MADV_COLLAPSE
> > > > > > > +                     fprintf(stderr, "MADV_COLLAPSE not supported. "
> > > > > > > +                             "Can't support -h\n");
> > > > > > > +                     exit(86);
> > > > > > > +#endif
> > > > > > > +                     hugepages = 1;
> > > > > > > +                     break;
> > > > > >
> > > > > > ...
> > > > > > if we could change this part to:
> > > > > >
> > > > > > #ifdef MADV_COLLAPSE
> > > > > >         hugepages = 1;
> > > > > > #endif
> > > > > >         break;
> > > > > >
> > > > > > to avoid the test failures on old systems.
> > > > > >
> > > > > > Or any better ideas from you :)
> > > > >
> > > > > Hi Zorro,
> > > > >
> > > > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What
> > > > > do you think about skipping generic/758 and generic/759 if the kernel
> > > > > version is older than 6.1? That to me seems more preferable than the
> > > > > paste above, as the paste above would execute the test as if it did
> > > > > test hugepages when in reality it didn't, which would be misleading.
> > > >
> > > > Now that I've gotten to try this out --
> > > >
> > > > There's a couple of things going on here.  The first is that generic/759
> > > > and 760 need to check if invoking fsx -h causes it to spit out the
> > > > "MADV_COLLAPSE not supported" error and _notrun the test.
> > > >
> > > > The second thing is that userspace programs can ensure the existence of
> > > > MADV_COLLAPSE in multiple ways.  The first way is through sys/mman.h,
> > > > which requires that the underlying C library headers are new enough to
> > > > include a definition.  glibc 2.37 is new enough, but even things like
> > > > Debian 12 and RHEL 9 aren't new enough to have that.  Other C libraries
> > > > might not follow glibc's practice of wrapping and/or redefining symbols
> > > > in a way that you hope is the same as...
> > > >
> > > > The second way is through linux/mman.h, which comes from the kernel
> > > > headers package; and the third way is for the program to define it
> > > > itself if nobody else does.
> > > >
> > > > So I think the easiest way to fix the fsx.c build is to include
> > > > linux/mman.h in addition to sys/mman.h.  Sorry I didn't notice these
> > >
> > > Thanks for your input. Do we still need sys/mman.h if linux/mman.h is added?
> >
> > Yes, because glibc provides the mmap() function that wraps
> > syscall(__NR_mmap, ...);
> >
> > > For generic/758 and 759, does it suffice to gate this on whether the
> > > kernel version if 6.1+ and _notrun if not? My understanding is that
> > > any kernel version 6.1 or newer will have MADV_COLLAPSE in its kernel
> > > headers package and support the feature.
> >
> > No, because some (most?) vendors backport new features into existing
> > kernels without revving the version number of that kernel.
> 
> Oh okay, I see. That makes sense, thanks for the explanation.
> 
> >
> > Maybe the following fixes things?
> >
> > --D
> >
> > generic/759,760: fix MADV_COLLAPSE detection and inclusion
> >
> > On systems with "old" C libraries such as glibc 2.36 in Debian 12, the
> > MADV_COLLAPSE flag might not be defined in any of the header files
> > pulled in by sys/mman.h, which means that the fsx binary might not get
> > built with any of the MADV_COLLAPSE code.  If the kernel supports THP,
> > the test will fail with:
> >
> > >  QA output created by 760
> > >  fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > +mapped writes DISABLED
> > > +MADV_COLLAPSE not supported. Can't support -h
> >
> > Fix both tests to detect fsx binaries that don't support MADV_COLLAPSE,
> > then fix fsx.c to include the mman.h from the kernel headers (aka
> > linux/mman.h) so that we can actually test IOs to and from THPs if the
> > kernel is newer than the rest of userspace.
> >
> > Cc: <fstests@vger.kernel.org> # v2025.02.02
> > Fixes: 627289232371e3 ("generic: add tests for read/writes from hugepages-backed buffers")
> > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> > ---
> >  ltp/fsx.c         |    1 +
> >  tests/generic/759 |    3 +++
> >  tests/generic/760 |    3 +++
> >  3 files changed, 7 insertions(+)
> >
> > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > index 634c496ffe9317..cf9502a74c17a7 100644
> > --- a/ltp/fsx.c
> > +++ b/ltp/fsx.c
> > @@ -20,6 +20,7 @@
> >  #include <strings.h>
> >  #include <sys/file.h>
> >  #include <sys/mman.h>
> > +#include <linux/mman.h>
> >  #include <sys/uio.h>
> >  #include <stdbool.h>
> >  #ifdef HAVE_ERR_H
> > diff --git a/tests/generic/759 b/tests/generic/759
> > index 6c74478aa8a0e0..3549c5ed6a9299 100755
> > --- a/tests/generic/759
> > +++ b/tests/generic/759
> > @@ -14,6 +14,9 @@ _begin_fstest rw auto quick
> >  _require_test
> >  _require_thp
> >
> > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \
> > +       _notrun "fsx binary does not support MADV_COLLAPSE"
> > +
> >  run_fsx -N 10000            -l 500000 -h
> >  run_fsx -N 10000  -o 8192   -l 500000 -h
> >  run_fsx -N 10000  -o 128000 -l 500000 -h
> > diff --git a/tests/generic/760 b/tests/generic/760
> > index c71a196222ad3b..2fbd28502ae678 100755
> > --- a/tests/generic/760
> > +++ b/tests/generic/760
> > @@ -15,6 +15,9 @@ _require_test
> >  _require_odirect
> >  _require_thp
> >
> > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \
> > +       _notrun "fsx binary does not support MADV_COLLAPSE"
> > +
> 
> I tried this out locally and it works for me:
> 
> generic/759 8s ... [not run] fsx binary does not support MADV_COLLAPSE
> Ran: generic/759
> Not run: generic/759
> Passed all 1 tests
> 
> SECTION       -- fuse
> =========================
> Ran: generic/759
> Not run: generic/759
> Passed all 1 tests

Does the test actually run if you /do/ have kernel/libc headers that
define MADV_COLLAPSE?  And if so, does that count as a Tested-by?

--D

> 
> Thanks,
> Joanne
> 
> >  psize=`$here/src/feature -s`
> >  bsize=`$here/src/min_dio_alignment $TEST_DIR $TEST_DEV`
> >
>
Joanne Koong Feb. 3, 2025, 9:40 p.m. UTC | #8
On Mon, Feb 3, 2025 at 12:01 PM Darrick J. Wong <djwong@kernel.org> wrote:
>
> On Mon, Feb 03, 2025 at 11:57:23AM -0800, Joanne Koong wrote:
> > On Mon, Feb 3, 2025 at 11:41 AM Darrick J. Wong <djwong@kernel.org> wrote:
> > >
> > > On Mon, Feb 03, 2025 at 11:23:20AM -0800, Joanne Koong wrote:
> > > > On Mon, Feb 3, 2025 at 10:59 AM Darrick J. Wong <djwong@kernel.org> wrote:
> > > > >
> > > > > On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote:
> > > > > > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote:
> > > > > > >
> > > > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote:
> > > > > > > > Add support for reads/writes from buffers backed by hugepages.
> > > > > > > > This can be enabled through the '-h' flag. This flag should only be used
> > > > > > > > on systems where THP capabilities are enabled.
> > > > > > > >
> > > > > > > > This is motivated by a recent bug that was due to faulty handling of
> > > > > > > > userspace buffers backed by hugepages. This patch is a mitigation
> > > > > > > > against problems like this in the future.
> > > > > > > >
> > > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > > > > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > > > > > > > ---
> > > > > > >
> > > > > > > Those two test cases fail on old system which doesn't support
> > > > > > > MADV_COLLAPSE:
> > > > > > >
> > > > > > >    fsx -N 10000 -l 500000 -h
> > > > > > >   -fsx -N 10000 -o 8192 -l 500000 -h
> > > > > > >   -fsx -N 10000 -o 128000 -l 500000 -h
> > > > > > >   +MADV_COLLAPSE not supported. Can't support -h
> > > > > > >
> > > > > > > and
> > > > > > >
> > > > > > >    fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > >   -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > >   -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > >   +mapped writes DISABLED
> > > > > > >   +MADV_COLLAPSE not supported. Can't support -h
> > > > > > >
> > > > > > > I'm wondering ...
> > > > > > >
> > > > > > > >  ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > > > > > > >  1 file changed, 153 insertions(+), 13 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > > > > > index 41933354..3be383c6 100644
> > > > > > > > --- a/ltp/fsx.c
> > > > > > > > +++ b/ltp/fsx.c
> > > > > > > >  static struct option longopts[] = {
> > > > > > > >       {"replay-ops", required_argument, 0, 256},
> > > > > > > >       {"record-ops", optional_argument, 0, 255},
> > > > > > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv)
> > > > > > > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > > > > > > >
> > > > > > > >       while ((ch = getopt_long(argc, argv,
> > > > > > > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > > >                                longopts, NULL)) != EOF)
> > > > > > > >               switch (ch) {
> > > > > > > >               case 'b':
> > > > > > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv)
> > > > > > > >               case 'g':
> > > > > > > >                       filldata = *optarg;
> > > > > > > >                       break;
> > > > > > > > +             case 'h':
> > > > > > > > +#ifndef MADV_COLLAPSE
> > > > > > > > +                     fprintf(stderr, "MADV_COLLAPSE not supported. "
> > > > > > > > +                             "Can't support -h\n");
> > > > > > > > +                     exit(86);
> > > > > > > > +#endif
> > > > > > > > +                     hugepages = 1;
> > > > > > > > +                     break;
> > > > > > >
> > > > > > > ...
> > > > > > > if we could change this part to:
> > > > > > >
> > > > > > > #ifdef MADV_COLLAPSE
> > > > > > >         hugepages = 1;
> > > > > > > #endif
> > > > > > >         break;
> > > > > > >
> > > > > > > to avoid the test failures on old systems.
> > > > > > >
> > > > > > > Or any better ideas from you :)
> > > > > >
> > > > > > Hi Zorro,
> > > > > >
> > > > > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What
> > > > > > do you think about skipping generic/758 and generic/759 if the kernel
> > > > > > version is older than 6.1? That to me seems more preferable than the
> > > > > > paste above, as the paste above would execute the test as if it did
> > > > > > test hugepages when in reality it didn't, which would be misleading.
> > > > >
> > > > > Now that I've gotten to try this out --
> > > > >
> > > > > There's a couple of things going on here.  The first is that generic/759
> > > > > and 760 need to check if invoking fsx -h causes it to spit out the
> > > > > "MADV_COLLAPSE not supported" error and _notrun the test.
> > > > >
> > > > > The second thing is that userspace programs can ensure the existence of
> > > > > MADV_COLLAPSE in multiple ways.  The first way is through sys/mman.h,
> > > > > which requires that the underlying C library headers are new enough to
> > > > > include a definition.  glibc 2.37 is new enough, but even things like
> > > > > Debian 12 and RHEL 9 aren't new enough to have that.  Other C libraries
> > > > > might not follow glibc's practice of wrapping and/or redefining symbols
> > > > > in a way that you hope is the same as...
> > > > >
> > > > > The second way is through linux/mman.h, which comes from the kernel
> > > > > headers package; and the third way is for the program to define it
> > > > > itself if nobody else does.
> > > > >
> > > > > So I think the easiest way to fix the fsx.c build is to include
> > > > > linux/mman.h in addition to sys/mman.h.  Sorry I didn't notice these
> > > >
> > > > Thanks for your input. Do we still need sys/mman.h if linux/mman.h is added?
> > >
> > > Yes, because glibc provides the mmap() function that wraps
> > > syscall(__NR_mmap, ...);
> > >
> > > > For generic/758 and 759, does it suffice to gate this on whether the
> > > > kernel version if 6.1+ and _notrun if not? My understanding is that
> > > > any kernel version 6.1 or newer will have MADV_COLLAPSE in its kernel
> > > > headers package and support the feature.
> > >
> > > No, because some (most?) vendors backport new features into existing
> > > kernels without revving the version number of that kernel.
> >
> > Oh okay, I see. That makes sense, thanks for the explanation.
> >
> > >
> > > Maybe the following fixes things?
> > >
> > > --D
> > >
> > > generic/759,760: fix MADV_COLLAPSE detection and inclusion
> > >
> > > On systems with "old" C libraries such as glibc 2.36 in Debian 12, the
> > > MADV_COLLAPSE flag might not be defined in any of the header files
> > > pulled in by sys/mman.h, which means that the fsx binary might not get
> > > built with any of the MADV_COLLAPSE code.  If the kernel supports THP,
> > > the test will fail with:
> > >
> > > >  QA output created by 760
> > > >  fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > +mapped writes DISABLED
> > > > +MADV_COLLAPSE not supported. Can't support -h
> > >
> > > Fix both tests to detect fsx binaries that don't support MADV_COLLAPSE,
> > > then fix fsx.c to include the mman.h from the kernel headers (aka
> > > linux/mman.h) so that we can actually test IOs to and from THPs if the
> > > kernel is newer than the rest of userspace.
> > >
> > > Cc: <fstests@vger.kernel.org> # v2025.02.02
> > > Fixes: 627289232371e3 ("generic: add tests for read/writes from hugepages-backed buffers")
> > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> > > ---
> > >  ltp/fsx.c         |    1 +
> > >  tests/generic/759 |    3 +++
> > >  tests/generic/760 |    3 +++
> > >  3 files changed, 7 insertions(+)
> > >
> > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > index 634c496ffe9317..cf9502a74c17a7 100644
> > > --- a/ltp/fsx.c
> > > +++ b/ltp/fsx.c
> > > @@ -20,6 +20,7 @@
> > >  #include <strings.h>
> > >  #include <sys/file.h>
> > >  #include <sys/mman.h>
> > > +#include <linux/mman.h>
> > >  #include <sys/uio.h>
> > >  #include <stdbool.h>
> > >  #ifdef HAVE_ERR_H
> > > diff --git a/tests/generic/759 b/tests/generic/759
> > > index 6c74478aa8a0e0..3549c5ed6a9299 100755
> > > --- a/tests/generic/759
> > > +++ b/tests/generic/759
> > > @@ -14,6 +14,9 @@ _begin_fstest rw auto quick
> > >  _require_test
> > >  _require_thp
> > >
> > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \
> > > +       _notrun "fsx binary does not support MADV_COLLAPSE"
> > > +
> > >  run_fsx -N 10000            -l 500000 -h
> > >  run_fsx -N 10000  -o 8192   -l 500000 -h
> > >  run_fsx -N 10000  -o 128000 -l 500000 -h
> > > diff --git a/tests/generic/760 b/tests/generic/760
> > > index c71a196222ad3b..2fbd28502ae678 100755
> > > --- a/tests/generic/760
> > > +++ b/tests/generic/760
> > > @@ -15,6 +15,9 @@ _require_test
> > >  _require_odirect
> > >  _require_thp
> > >
> > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \
> > > +       _notrun "fsx binary does not support MADV_COLLAPSE"
> > > +
> >
> > I tried this out locally and it works for me:
> >
> > generic/759 8s ... [not run] fsx binary does not support MADV_COLLAPSE
> > Ran: generic/759
> > Not run: generic/759
> > Passed all 1 tests
> >
> > SECTION       -- fuse
> > =========================
> > Ran: generic/759
> > Not run: generic/759
> > Passed all 1 tests
>
> Does the test actually run if you /do/ have kernel/libc headers that
> define MADV_COLLAPSE?  And if so, does that count as a Tested-by?
>

I'm not sure if I fully understand the subtext of your question but
yes, the test runs on my system when MADV_COLLAPSE is defined in the
kernel/libc headers.
For sanity checking the inverse case, (eg MADV_COLLAPSE not defined),
I tried this out by modifying the ifdef/ifndef checks in fsx to
MADV_COLLAPSE_ to see if the '$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 |
grep -q 'MADV_COLLAPSE not supported'' line catches that.


> --D
>
> >
> > Thanks,
> > Joanne
> >
> > >  psize=`$here/src/feature -s`
> > >  bsize=`$here/src/min_dio_alignment $TEST_DIR $TEST_DEV`
> > >
> >
Darrick J. Wong Feb. 3, 2025, 9:54 p.m. UTC | #9
On Mon, Feb 03, 2025 at 01:40:39PM -0800, Joanne Koong wrote:
> On Mon, Feb 3, 2025 at 12:01 PM Darrick J. Wong <djwong@kernel.org> wrote:
> >
> > On Mon, Feb 03, 2025 at 11:57:23AM -0800, Joanne Koong wrote:
> > > On Mon, Feb 3, 2025 at 11:41 AM Darrick J. Wong <djwong@kernel.org> wrote:
> > > >
> > > > On Mon, Feb 03, 2025 at 11:23:20AM -0800, Joanne Koong wrote:
> > > > > On Mon, Feb 3, 2025 at 10:59 AM Darrick J. Wong <djwong@kernel.org> wrote:
> > > > > >
> > > > > > On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote:
> > > > > > > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote:
> > > > > > > > > Add support for reads/writes from buffers backed by hugepages.
> > > > > > > > > This can be enabled through the '-h' flag. This flag should only be used
> > > > > > > > > on systems where THP capabilities are enabled.
> > > > > > > > >
> > > > > > > > > This is motivated by a recent bug that was due to faulty handling of
> > > > > > > > > userspace buffers backed by hugepages. This patch is a mitigation
> > > > > > > > > against problems like this in the future.
> > > > > > > > >
> > > > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > > > > > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > > > > > > > > ---
> > > > > > > >
> > > > > > > > Those two test cases fail on old system which doesn't support
> > > > > > > > MADV_COLLAPSE:
> > > > > > > >
> > > > > > > >    fsx -N 10000 -l 500000 -h
> > > > > > > >   -fsx -N 10000 -o 8192 -l 500000 -h
> > > > > > > >   -fsx -N 10000 -o 128000 -l 500000 -h
> > > > > > > >   +MADV_COLLAPSE not supported. Can't support -h
> > > > > > > >
> > > > > > > > and
> > > > > > > >
> > > > > > > >    fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > >   -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > >   -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > >   +mapped writes DISABLED
> > > > > > > >   +MADV_COLLAPSE not supported. Can't support -h
> > > > > > > >
> > > > > > > > I'm wondering ...
> > > > > > > >
> > > > > > > > >  ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > > > > > > > >  1 file changed, 153 insertions(+), 13 deletions(-)
> > > > > > > > >
> > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > > > > > > index 41933354..3be383c6 100644
> > > > > > > > > --- a/ltp/fsx.c
> > > > > > > > > +++ b/ltp/fsx.c
> > > > > > > > >  static struct option longopts[] = {
> > > > > > > > >       {"replay-ops", required_argument, 0, 256},
> > > > > > > > >       {"record-ops", optional_argument, 0, 255},
> > > > > > > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv)
> > > > > > > > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > > > > > > > >
> > > > > > > > >       while ((ch = getopt_long(argc, argv,
> > > > > > > > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > > > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > > > >                                longopts, NULL)) != EOF)
> > > > > > > > >               switch (ch) {
> > > > > > > > >               case 'b':
> > > > > > > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv)
> > > > > > > > >               case 'g':
> > > > > > > > >                       filldata = *optarg;
> > > > > > > > >                       break;
> > > > > > > > > +             case 'h':
> > > > > > > > > +#ifndef MADV_COLLAPSE
> > > > > > > > > +                     fprintf(stderr, "MADV_COLLAPSE not supported. "
> > > > > > > > > +                             "Can't support -h\n");
> > > > > > > > > +                     exit(86);
> > > > > > > > > +#endif
> > > > > > > > > +                     hugepages = 1;
> > > > > > > > > +                     break;
> > > > > > > >
> > > > > > > > ...
> > > > > > > > if we could change this part to:
> > > > > > > >
> > > > > > > > #ifdef MADV_COLLAPSE
> > > > > > > >         hugepages = 1;
> > > > > > > > #endif
> > > > > > > >         break;
> > > > > > > >
> > > > > > > > to avoid the test failures on old systems.
> > > > > > > >
> > > > > > > > Or any better ideas from you :)
> > > > > > >
> > > > > > > Hi Zorro,
> > > > > > >
> > > > > > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What
> > > > > > > do you think about skipping generic/758 and generic/759 if the kernel
> > > > > > > version is older than 6.1? That to me seems more preferable than the
> > > > > > > paste above, as the paste above would execute the test as if it did
> > > > > > > test hugepages when in reality it didn't, which would be misleading.
> > > > > >
> > > > > > Now that I've gotten to try this out --
> > > > > >
> > > > > > There's a couple of things going on here.  The first is that generic/759
> > > > > > and 760 need to check if invoking fsx -h causes it to spit out the
> > > > > > "MADV_COLLAPSE not supported" error and _notrun the test.
> > > > > >
> > > > > > The second thing is that userspace programs can ensure the existence of
> > > > > > MADV_COLLAPSE in multiple ways.  The first way is through sys/mman.h,
> > > > > > which requires that the underlying C library headers are new enough to
> > > > > > include a definition.  glibc 2.37 is new enough, but even things like
> > > > > > Debian 12 and RHEL 9 aren't new enough to have that.  Other C libraries
> > > > > > might not follow glibc's practice of wrapping and/or redefining symbols
> > > > > > in a way that you hope is the same as...
> > > > > >
> > > > > > The second way is through linux/mman.h, which comes from the kernel
> > > > > > headers package; and the third way is for the program to define it
> > > > > > itself if nobody else does.
> > > > > >
> > > > > > So I think the easiest way to fix the fsx.c build is to include
> > > > > > linux/mman.h in addition to sys/mman.h.  Sorry I didn't notice these
> > > > >
> > > > > Thanks for your input. Do we still need sys/mman.h if linux/mman.h is added?
> > > >
> > > > Yes, because glibc provides the mmap() function that wraps
> > > > syscall(__NR_mmap, ...);
> > > >
> > > > > For generic/758 and 759, does it suffice to gate this on whether the
> > > > > kernel version if 6.1+ and _notrun if not? My understanding is that
> > > > > any kernel version 6.1 or newer will have MADV_COLLAPSE in its kernel
> > > > > headers package and support the feature.
> > > >
> > > > No, because some (most?) vendors backport new features into existing
> > > > kernels without revving the version number of that kernel.
> > >
> > > Oh okay, I see. That makes sense, thanks for the explanation.
> > >
> > > >
> > > > Maybe the following fixes things?
> > > >
> > > > --D
> > > >
> > > > generic/759,760: fix MADV_COLLAPSE detection and inclusion
> > > >
> > > > On systems with "old" C libraries such as glibc 2.36 in Debian 12, the
> > > > MADV_COLLAPSE flag might not be defined in any of the header files
> > > > pulled in by sys/mman.h, which means that the fsx binary might not get
> > > > built with any of the MADV_COLLAPSE code.  If the kernel supports THP,
> > > > the test will fail with:
> > > >
> > > > >  QA output created by 760
> > > > >  fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > +mapped writes DISABLED
> > > > > +MADV_COLLAPSE not supported. Can't support -h
> > > >
> > > > Fix both tests to detect fsx binaries that don't support MADV_COLLAPSE,
> > > > then fix fsx.c to include the mman.h from the kernel headers (aka
> > > > linux/mman.h) so that we can actually test IOs to and from THPs if the
> > > > kernel is newer than the rest of userspace.
> > > >
> > > > Cc: <fstests@vger.kernel.org> # v2025.02.02
> > > > Fixes: 627289232371e3 ("generic: add tests for read/writes from hugepages-backed buffers")
> > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> > > > ---
> > > >  ltp/fsx.c         |    1 +
> > > >  tests/generic/759 |    3 +++
> > > >  tests/generic/760 |    3 +++
> > > >  3 files changed, 7 insertions(+)
> > > >
> > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > index 634c496ffe9317..cf9502a74c17a7 100644
> > > > --- a/ltp/fsx.c
> > > > +++ b/ltp/fsx.c
> > > > @@ -20,6 +20,7 @@
> > > >  #include <strings.h>
> > > >  #include <sys/file.h>
> > > >  #include <sys/mman.h>
> > > > +#include <linux/mman.h>
> > > >  #include <sys/uio.h>
> > > >  #include <stdbool.h>
> > > >  #ifdef HAVE_ERR_H
> > > > diff --git a/tests/generic/759 b/tests/generic/759
> > > > index 6c74478aa8a0e0..3549c5ed6a9299 100755
> > > > --- a/tests/generic/759
> > > > +++ b/tests/generic/759
> > > > @@ -14,6 +14,9 @@ _begin_fstest rw auto quick
> > > >  _require_test
> > > >  _require_thp
> > > >
> > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \
> > > > +       _notrun "fsx binary does not support MADV_COLLAPSE"
> > > > +
> > > >  run_fsx -N 10000            -l 500000 -h
> > > >  run_fsx -N 10000  -o 8192   -l 500000 -h
> > > >  run_fsx -N 10000  -o 128000 -l 500000 -h
> > > > diff --git a/tests/generic/760 b/tests/generic/760
> > > > index c71a196222ad3b..2fbd28502ae678 100755
> > > > --- a/tests/generic/760
> > > > +++ b/tests/generic/760
> > > > @@ -15,6 +15,9 @@ _require_test
> > > >  _require_odirect
> > > >  _require_thp
> > > >
> > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \
> > > > +       _notrun "fsx binary does not support MADV_COLLAPSE"
> > > > +
> > >
> > > I tried this out locally and it works for me:
> > >
> > > generic/759 8s ... [not run] fsx binary does not support MADV_COLLAPSE
> > > Ran: generic/759
> > > Not run: generic/759
> > > Passed all 1 tests
> > >
> > > SECTION       -- fuse
> > > =========================
> > > Ran: generic/759
> > > Not run: generic/759
> > > Passed all 1 tests
> >
> > Does the test actually run if you /do/ have kernel/libc headers that
> > define MADV_COLLAPSE?  And if so, does that count as a Tested-by?
> >
> 
> I'm not sure if I fully understand the subtext of your question but
> yes, the test runs on my system when MADV_COLLAPSE is defined in the
> kernel/libc headers.

Yep, you understood me correctly. :)

> For sanity checking the inverse case, (eg MADV_COLLAPSE not defined),
> I tried this out by modifying the ifdef/ifndef checks in fsx to
> MADV_COLLAPSE_ to see if the '$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 |
> grep -q 'MADV_COLLAPSE not supported'' line catches that.

<nod> Sounds good then; I'll add this to my test clod and run it
overnight.

--D

> 
> > --D
> >
> > >
> > > Thanks,
> > > Joanne
> > >
> > > >  psize=`$here/src/feature -s`
> > > >  bsize=`$here/src/min_dio_alignment $TEST_DIR $TEST_DEV`
> > > >
> > >
Darrick J. Wong Feb. 4, 2025, 4:21 a.m. UTC | #10
On Mon, Feb 03, 2025 at 01:54:32PM -0800, Darrick J. Wong wrote:
> On Mon, Feb 03, 2025 at 01:40:39PM -0800, Joanne Koong wrote:
> > On Mon, Feb 3, 2025 at 12:01 PM Darrick J. Wong <djwong@kernel.org> wrote:
> > >
> > > On Mon, Feb 03, 2025 at 11:57:23AM -0800, Joanne Koong wrote:
> > > > On Mon, Feb 3, 2025 at 11:41 AM Darrick J. Wong <djwong@kernel.org> wrote:
> > > > >
> > > > > On Mon, Feb 03, 2025 at 11:23:20AM -0800, Joanne Koong wrote:
> > > > > > On Mon, Feb 3, 2025 at 10:59 AM Darrick J. Wong <djwong@kernel.org> wrote:
> > > > > > >
> > > > > > > On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote:
> > > > > > > > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote:
> > > > > > > > > > Add support for reads/writes from buffers backed by hugepages.
> > > > > > > > > > This can be enabled through the '-h' flag. This flag should only be used
> > > > > > > > > > on systems where THP capabilities are enabled.
> > > > > > > > > >
> > > > > > > > > > This is motivated by a recent bug that was due to faulty handling of
> > > > > > > > > > userspace buffers backed by hugepages. This patch is a mitigation
> > > > > > > > > > against problems like this in the future.
> > > > > > > > > >
> > > > > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > > > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > > > > > > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > > > > > > > > > ---
> > > > > > > > >
> > > > > > > > > Those two test cases fail on old system which doesn't support
> > > > > > > > > MADV_COLLAPSE:
> > > > > > > > >
> > > > > > > > >    fsx -N 10000 -l 500000 -h
> > > > > > > > >   -fsx -N 10000 -o 8192 -l 500000 -h
> > > > > > > > >   -fsx -N 10000 -o 128000 -l 500000 -h
> > > > > > > > >   +MADV_COLLAPSE not supported. Can't support -h
> > > > > > > > >
> > > > > > > > > and
> > > > > > > > >
> > > > > > > > >    fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > > >   -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > > >   -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > > >   +mapped writes DISABLED
> > > > > > > > >   +MADV_COLLAPSE not supported. Can't support -h
> > > > > > > > >
> > > > > > > > > I'm wondering ...
> > > > > > > > >
> > > > > > > > > >  ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > > > > > > > > >  1 file changed, 153 insertions(+), 13 deletions(-)
> > > > > > > > > >
> > > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > > > > > > > index 41933354..3be383c6 100644
> > > > > > > > > > --- a/ltp/fsx.c
> > > > > > > > > > +++ b/ltp/fsx.c
> > > > > > > > > >  static struct option longopts[] = {
> > > > > > > > > >       {"replay-ops", required_argument, 0, 256},
> > > > > > > > > >       {"record-ops", optional_argument, 0, 255},
> > > > > > > > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv)
> > > > > > > > > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > > > > > > > > >
> > > > > > > > > >       while ((ch = getopt_long(argc, argv,
> > > > > > > > > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > > > > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > > > > >                                longopts, NULL)) != EOF)
> > > > > > > > > >               switch (ch) {
> > > > > > > > > >               case 'b':
> > > > > > > > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv)
> > > > > > > > > >               case 'g':
> > > > > > > > > >                       filldata = *optarg;
> > > > > > > > > >                       break;
> > > > > > > > > > +             case 'h':
> > > > > > > > > > +#ifndef MADV_COLLAPSE
> > > > > > > > > > +                     fprintf(stderr, "MADV_COLLAPSE not supported. "
> > > > > > > > > > +                             "Can't support -h\n");
> > > > > > > > > > +                     exit(86);
> > > > > > > > > > +#endif
> > > > > > > > > > +                     hugepages = 1;
> > > > > > > > > > +                     break;
> > > > > > > > >
> > > > > > > > > ...
> > > > > > > > > if we could change this part to:
> > > > > > > > >
> > > > > > > > > #ifdef MADV_COLLAPSE
> > > > > > > > >         hugepages = 1;
> > > > > > > > > #endif
> > > > > > > > >         break;
> > > > > > > > >
> > > > > > > > > to avoid the test failures on old systems.
> > > > > > > > >
> > > > > > > > > Or any better ideas from you :)
> > > > > > > >
> > > > > > > > Hi Zorro,
> > > > > > > >
> > > > > > > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What
> > > > > > > > do you think about skipping generic/758 and generic/759 if the kernel
> > > > > > > > version is older than 6.1? That to me seems more preferable than the
> > > > > > > > paste above, as the paste above would execute the test as if it did
> > > > > > > > test hugepages when in reality it didn't, which would be misleading.
> > > > > > >
> > > > > > > Now that I've gotten to try this out --
> > > > > > >
> > > > > > > There's a couple of things going on here.  The first is that generic/759
> > > > > > > and 760 need to check if invoking fsx -h causes it to spit out the
> > > > > > > "MADV_COLLAPSE not supported" error and _notrun the test.
> > > > > > >
> > > > > > > The second thing is that userspace programs can ensure the existence of
> > > > > > > MADV_COLLAPSE in multiple ways.  The first way is through sys/mman.h,
> > > > > > > which requires that the underlying C library headers are new enough to
> > > > > > > include a definition.  glibc 2.37 is new enough, but even things like
> > > > > > > Debian 12 and RHEL 9 aren't new enough to have that.  Other C libraries
> > > > > > > might not follow glibc's practice of wrapping and/or redefining symbols
> > > > > > > in a way that you hope is the same as...
> > > > > > >
> > > > > > > The second way is through linux/mman.h, which comes from the kernel
> > > > > > > headers package; and the third way is for the program to define it
> > > > > > > itself if nobody else does.
> > > > > > >
> > > > > > > So I think the easiest way to fix the fsx.c build is to include
> > > > > > > linux/mman.h in addition to sys/mman.h.  Sorry I didn't notice these
> > > > > >
> > > > > > Thanks for your input. Do we still need sys/mman.h if linux/mman.h is added?
> > > > >
> > > > > Yes, because glibc provides the mmap() function that wraps
> > > > > syscall(__NR_mmap, ...);
> > > > >
> > > > > > For generic/758 and 759, does it suffice to gate this on whether the
> > > > > > kernel version if 6.1+ and _notrun if not? My understanding is that
> > > > > > any kernel version 6.1 or newer will have MADV_COLLAPSE in its kernel
> > > > > > headers package and support the feature.
> > > > >
> > > > > No, because some (most?) vendors backport new features into existing
> > > > > kernels without revving the version number of that kernel.
> > > >
> > > > Oh okay, I see. That makes sense, thanks for the explanation.
> > > >
> > > > >
> > > > > Maybe the following fixes things?
> > > > >
> > > > > --D
> > > > >
> > > > > generic/759,760: fix MADV_COLLAPSE detection and inclusion
> > > > >
> > > > > On systems with "old" C libraries such as glibc 2.36 in Debian 12, the
> > > > > MADV_COLLAPSE flag might not be defined in any of the header files
> > > > > pulled in by sys/mman.h, which means that the fsx binary might not get
> > > > > built with any of the MADV_COLLAPSE code.  If the kernel supports THP,
> > > > > the test will fail with:
> > > > >
> > > > > >  QA output created by 760
> > > > > >  fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > +mapped writes DISABLED
> > > > > > +MADV_COLLAPSE not supported. Can't support -h
> > > > >
> > > > > Fix both tests to detect fsx binaries that don't support MADV_COLLAPSE,
> > > > > then fix fsx.c to include the mman.h from the kernel headers (aka
> > > > > linux/mman.h) so that we can actually test IOs to and from THPs if the
> > > > > kernel is newer than the rest of userspace.
> > > > >
> > > > > Cc: <fstests@vger.kernel.org> # v2025.02.02
> > > > > Fixes: 627289232371e3 ("generic: add tests for read/writes from hugepages-backed buffers")
> > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> > > > > ---
> > > > >  ltp/fsx.c         |    1 +
> > > > >  tests/generic/759 |    3 +++
> > > > >  tests/generic/760 |    3 +++
> > > > >  3 files changed, 7 insertions(+)
> > > > >
> > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > > index 634c496ffe9317..cf9502a74c17a7 100644
> > > > > --- a/ltp/fsx.c
> > > > > +++ b/ltp/fsx.c
> > > > > @@ -20,6 +20,7 @@
> > > > >  #include <strings.h>
> > > > >  #include <sys/file.h>
> > > > >  #include <sys/mman.h>
> > > > > +#include <linux/mman.h>
> > > > >  #include <sys/uio.h>
> > > > >  #include <stdbool.h>
> > > > >  #ifdef HAVE_ERR_H
> > > > > diff --git a/tests/generic/759 b/tests/generic/759
> > > > > index 6c74478aa8a0e0..3549c5ed6a9299 100755
> > > > > --- a/tests/generic/759
> > > > > +++ b/tests/generic/759
> > > > > @@ -14,6 +14,9 @@ _begin_fstest rw auto quick
> > > > >  _require_test
> > > > >  _require_thp
> > > > >
> > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \
> > > > > +       _notrun "fsx binary does not support MADV_COLLAPSE"
> > > > > +
> > > > >  run_fsx -N 10000            -l 500000 -h
> > > > >  run_fsx -N 10000  -o 8192   -l 500000 -h
> > > > >  run_fsx -N 10000  -o 128000 -l 500000 -h
> > > > > diff --git a/tests/generic/760 b/tests/generic/760
> > > > > index c71a196222ad3b..2fbd28502ae678 100755
> > > > > --- a/tests/generic/760
> > > > > +++ b/tests/generic/760
> > > > > @@ -15,6 +15,9 @@ _require_test
> > > > >  _require_odirect
> > > > >  _require_thp
> > > > >
> > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \
> > > > > +       _notrun "fsx binary does not support MADV_COLLAPSE"
> > > > > +
> > > >
> > > > I tried this out locally and it works for me:
> > > >
> > > > generic/759 8s ... [not run] fsx binary does not support MADV_COLLAPSE
> > > > Ran: generic/759
> > > > Not run: generic/759
> > > > Passed all 1 tests
> > > >
> > > > SECTION       -- fuse
> > > > =========================
> > > > Ran: generic/759
> > > > Not run: generic/759
> > > > Passed all 1 tests
> > >
> > > Does the test actually run if you /do/ have kernel/libc headers that
> > > define MADV_COLLAPSE?  And if so, does that count as a Tested-by?
> > >
> > 
> > I'm not sure if I fully understand the subtext of your question but
> > yes, the test runs on my system when MADV_COLLAPSE is defined in the
> > kernel/libc headers.
> 
> Yep, you understood me correctly. :)
> 
> > For sanity checking the inverse case, (eg MADV_COLLAPSE not defined),
> > I tried this out by modifying the ifdef/ifndef checks in fsx to
> > MADV_COLLAPSE_ to see if the '$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 |
> > grep -q 'MADV_COLLAPSE not supported'' line catches that.
> 
> <nod> Sounds good then; I'll add this to my test clod and run it
> overnight.

The arm64 vms with 64k base pages spat out this:

--- /run/fstests/bin/tests/generic/759.out	2025-02-02 08:36:28.007248055 -0800
+++ /var/tmp/fstests/generic/759.out.bad	2025-02-03 16:51:34.862964640 -0800
@@ -1,4 +1,5 @@
 QA output created by 759
 fsx -N 10000 -l 500000 -h
-fsx -N 10000 -o 8192 -l 500000 -h
-fsx -N 10000 -o 128000 -l 500000 -h
+Seed set to 1
+madvise collapse for buf: Cannot allocate memory
+init_hugepages_buf failed for good_buf: Cannot allocate memory

Note that it was trying to create a 512M page(!) on a VM with 8G of
memory.  Normally one doesn't use large base page size in low memory
environments, but this /is/ fstestsland. 8-)

From commit 7d8faaf155454f ("mm/madvise: introduce MADV_COLLAPSE sync
hugepage collapse") :

	ENOMEM	Memory allocation failed or VMA not found
	EBUSY	Memcg charging failed
	EAGAIN	Required resource temporarily unavailable.  Try again
		might succeed.
	EINVAL	Other error: No PMD found, subpage doesn't have Present
		bit set, "Special" page no backed by struct page, VMA
		incorrectly sized, address not page-aligned, ...

It sounds like ENOMEM/EBUSY/EINVAL should result in
_notrun "Could not generate hugepage" ?  What are your thoughts?

--D

> --D
> 
> > 
> > > --D
> > >
> > > >
> > > > Thanks,
> > > > Joanne
> > > >
> > > > >  psize=`$here/src/feature -s`
> > > > >  bsize=`$here/src/min_dio_alignment $TEST_DIR $TEST_DEV`
> > > > >
> > > >
>
Zorro Lang Feb. 4, 2025, 5:05 p.m. UTC | #11
On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote:
> On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote:
> >
> > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote:
> > > Add support for reads/writes from buffers backed by hugepages.
> > > This can be enabled through the '-h' flag. This flag should only be used
> > > on systems where THP capabilities are enabled.
> > >
> > > This is motivated by a recent bug that was due to faulty handling of
> > > userspace buffers backed by hugepages. This patch is a mitigation
> > > against problems like this in the future.
> > >
> > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > > ---
> >
> > Those two test cases fail on old system which doesn't support
> > MADV_COLLAPSE:
> >
> >    fsx -N 10000 -l 500000 -h
> >   -fsx -N 10000 -o 8192 -l 500000 -h
> >   -fsx -N 10000 -o 128000 -l 500000 -h
> >   +MADV_COLLAPSE not supported. Can't support -h
> >
> > and
> >
> >    fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> >   -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> >   -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> >   +mapped writes DISABLED
> >   +MADV_COLLAPSE not supported. Can't support -h
> >
> > I'm wondering ...
> >
> > >  ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > >  1 file changed, 153 insertions(+), 13 deletions(-)
> > >
> > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > index 41933354..3be383c6 100644
> > > --- a/ltp/fsx.c
> > > +++ b/ltp/fsx.c
> > > @@ -190,6 +190,16 @@ int      o_direct;                       /* -Z */
> > >  int  aio = 0;
> > >  int  uring = 0;
> > >  int  mark_nr = 0;
> > > +int  hugepages = 0;                  /* -h flag */
> > > +
> > > +/* Stores info needed to periodically collapse hugepages */
> > > +struct hugepages_collapse_info {
> > > +     void *orig_good_buf;
> > > +     long good_buf_size;
> > > +     void *orig_temp_buf;
> > > +     long temp_buf_size;
> > > +};
> > > +struct hugepages_collapse_info hugepages_info;
> > >
> > >  int page_size;
> > >  int page_mask;
> > > @@ -2471,7 +2481,7 @@ void
> > >  usage(void)
> > >  {
> > >       fprintf(stdout, "usage: %s",
> > > -             "fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\
> > > +             "fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\
> > >          [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\
> > >          [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\
> > >          [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\
> > > @@ -2483,8 +2493,11 @@ usage(void)
> > >       -d: debug output for all operations\n\
> > >       -e: pollute post-eof on size changes (default 0)\n\
> > >       -f: flush and invalidate cache after I/O\n\
> > > -     -g X: write character X instead of random generated data\n\
> > > -     -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > > +     -g X: write character X instead of random generated data\n"
> > > +#ifdef MADV_COLLAPSE
> > > +"    -h hugepages: use buffers backed by hugepages for reads/writes\n"
> > > +#endif
> > > +"    -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > >       -j logid: prefix debug log messsages with this id\n\
> > >       -k: do not truncate existing file and use its size as upper bound on file size\n\
> > >       -l flen: the upper bound on file size (default 262144)\n\
> > > @@ -2833,11 +2846,41 @@ __test_fallocate(int mode, const char *mode_str)
> > >  #endif
> > >  }
> > >
> > > +/*
> > > + * Reclaim may break up hugepages, so do a best-effort collapse every once in
> > > + * a while.
> > > + */
> > > +static void
> > > +collapse_hugepages(void)
> > > +{
> > > +#ifdef MADV_COLLAPSE
> > > +     int ret;
> > > +
> > > +     /* re-collapse every 16k fsxops after we start */
> > > +     if (!numops || (numops & ((1U << 14) - 1)))
> > > +             return;
> > > +
> > > +     ret = madvise(hugepages_info.orig_good_buf,
> > > +                   hugepages_info.good_buf_size, MADV_COLLAPSE);
> > > +     if (ret)
> > > +             prt("collapsing hugepages for good_buf failed (numops=%llu): %s\n",
> > > +                  numops, strerror(errno));
> > > +     ret = madvise(hugepages_info.orig_temp_buf,
> > > +                   hugepages_info.temp_buf_size, MADV_COLLAPSE);
> > > +     if (ret)
> > > +             prt("collapsing hugepages for temp_buf failed (numops=%llu): %s\n",
> > > +                  numops, strerror(errno));
> > > +#endif
> > > +}
> > > +
> > >  bool
> > >  keep_running(void)
> > >  {
> > >       int ret;
> > >
> > > +     if (hugepages)
> > > +             collapse_hugepages();
> > > +
> > >       if (deadline.tv_nsec) {
> > >               struct timespec now;
> > >
> > > @@ -2856,6 +2899,103 @@ keep_running(void)
> > >       return numops-- != 0;
> > >  }
> > >
> > > +static long
> > > +get_hugepage_size(void)
> > > +{
> > > +     const char str[] = "Hugepagesize:";
> > > +     size_t str_len =  sizeof(str) - 1;
> > > +     unsigned int hugepage_size = 0;
> > > +     char buffer[64];
> > > +     FILE *file;
> > > +
> > > +     file = fopen("/proc/meminfo", "r");
> > > +     if (!file) {
> > > +             prterr("get_hugepage_size: fopen /proc/meminfo");
> > > +             return -1;
> > > +     }
> > > +     while (fgets(buffer, sizeof(buffer), file)) {
> > > +             if (strncmp(buffer, str, str_len) == 0) {
> > > +                     sscanf(buffer + str_len, "%u", &hugepage_size);
> > > +                     break;
> > > +             }
> > > +     }
> > > +     fclose(file);
> > > +     if (!hugepage_size) {
> > > +             prterr("get_hugepage_size: failed to find "
> > > +                     "hugepage size in /proc/meminfo\n");
> > > +             return -1;
> > > +     }
> > > +
> > > +     /* convert from KiB to bytes */
> > > +     return hugepage_size << 10;
> > > +}
> > > +
> > > +static void *
> > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment, long *buf_size)
> > > +{
> > > +     void *buf = NULL;
> > > +#ifdef MADV_COLLAPSE
> > > +     int ret;
> > > +     long size = roundup(len, hugepage_size) + alignment;
> > > +
> > > +     ret = posix_memalign(&buf, hugepage_size, size);
> > > +     if (ret) {
> > > +             prterr("posix_memalign for buf");
> > > +             return NULL;
> > > +     }
> > > +     memset(buf, '\0', size);
> > > +     ret = madvise(buf, size, MADV_COLLAPSE);
> > > +     if (ret) {
> > > +             prterr("madvise collapse for buf");
> > > +             free(buf);
> > > +             return NULL;
> > > +     }
> > > +
> > > +     *buf_size = size;
> > > +#endif
> > > +     return buf;
> > > +}
> > > +
> > > +static void
> > > +init_buffers(void)
> > > +{
> > > +     int i;
> > > +
> > > +     original_buf = (char *) malloc(maxfilelen);
> > > +     for (i = 0; i < maxfilelen; i++)
> > > +             original_buf[i] = random() % 256;
> > > +     if (hugepages) {
> > > +             long hugepage_size = get_hugepage_size();
> > > +             if (hugepage_size == -1) {
> > > +                     prterr("get_hugepage_size()");
> > > +                     exit(102);
> > > +             }
> > > +             good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy,
> > > +                                           &hugepages_info.good_buf_size);
> > > +             if (!good_buf) {
> > > +                     prterr("init_hugepages_buf failed for good_buf");
> > > +                     exit(103);
> > > +             }
> > > +             hugepages_info.orig_good_buf = good_buf;
> > > +
> > > +             temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy,
> > > +                                           &hugepages_info.temp_buf_size);
> > > +             if (!temp_buf) {
> > > +                     prterr("init_hugepages_buf failed for temp_buf");
> > > +                     exit(103);
> > > +             }
> > > +             hugepages_info.orig_temp_buf = temp_buf;
> > > +     } else {
> > > +             unsigned long good_buf_len = maxfilelen + writebdy;
> > > +             unsigned long temp_buf_len = maxoplen + readbdy;
> > > +
> > > +             good_buf = calloc(1, good_buf_len);
> > > +             temp_buf = calloc(1, temp_buf_len);
> > > +     }
> > > +     good_buf = round_ptr_up(good_buf, writebdy, 0);
> > > +     temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> > > +}
> > > +
> > >  static struct option longopts[] = {
> > >       {"replay-ops", required_argument, 0, 256},
> > >       {"record-ops", optional_argument, 0, 255},
> > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv)
> > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > >
> > >       while ((ch = getopt_long(argc, argv,
> > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > >                                longopts, NULL)) != EOF)
> > >               switch (ch) {
> > >               case 'b':
> > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv)
> > >               case 'g':
> > >                       filldata = *optarg;
> > >                       break;
> > > +             case 'h':
> > > +#ifndef MADV_COLLAPSE
> > > +                     fprintf(stderr, "MADV_COLLAPSE not supported. "
> > > +                             "Can't support -h\n");
> > > +                     exit(86);
> > > +#endif
> > > +                     hugepages = 1;
> > > +                     break;
> >
> > ...
> > if we could change this part to:
> >
> > #ifdef MADV_COLLAPSE
> >         hugepages = 1;
> > #endif
> >         break;
> >
> > to avoid the test failures on old systems.
> >
> > Or any better ideas from you :)
> 
> Hi Zorro,
> 
> It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What
> do you think about skipping generic/758 and generic/759 if the kernel
> version is older than 6.1? That to me seems more preferable than the
> paste above, as the paste above would execute the test as if it did
> test hugepages when in reality it didn't, which would be misleading.

Hi Joanne,

Sorry I'm still on my holiday, reply a bit late and hastily. At first,
your patch has been merged, the merged case numbers are g/759 and g/760,
you can rebase to latest for-next branch and write new fix patch :)

Then, you're right, above code change is bit rough;) Maybe there's a way
to _notrun if MADV_COLLAPSE isn't supported?

Thanks,
Zorro

> 
> 
> Thanks,
> Joanne
> 
> >
> > Thanks,
> > Zorro
> >
> > >               case 'i':
> > >                       integrity = 1;
> > >                       logdev = strdup(optarg);
> > > @@ -3229,15 +3377,7 @@ main(int argc, char **argv)
> > >                       exit(95);
> > >               }
> > >       }
> > > -     original_buf = (char *) malloc(maxfilelen);
> > > -     for (i = 0; i < maxfilelen; i++)
> > > -             original_buf[i] = random() % 256;
> > > -     good_buf = (char *) malloc(maxfilelen + writebdy);
> > > -     good_buf = round_ptr_up(good_buf, writebdy, 0);
> > > -     memset(good_buf, '\0', maxfilelen);
> > > -     temp_buf = (char *) malloc(maxoplen + readbdy);
> > > -     temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> > > -     memset(temp_buf, '\0', maxoplen);
> > > +     init_buffers();
> > >       if (lite) {     /* zero entire existing file */
> > >               ssize_t written;
> > >
> > > --
> > > 2.47.1
> > >
> >
>
Joanne Koong Feb. 4, 2025, 8:50 p.m. UTC | #12
On Mon, Feb 3, 2025 at 8:21 PM Darrick J. Wong <djwong@kernel.org> wrote:
>
> On Mon, Feb 03, 2025 at 01:54:32PM -0800, Darrick J. Wong wrote:
> > On Mon, Feb 03, 2025 at 01:40:39PM -0800, Joanne Koong wrote:
> > > On Mon, Feb 3, 2025 at 12:01 PM Darrick J. Wong <djwong@kernel.org> wrote:
> > > >
> > > > On Mon, Feb 03, 2025 at 11:57:23AM -0800, Joanne Koong wrote:
> > > > > On Mon, Feb 3, 2025 at 11:41 AM Darrick J. Wong <djwong@kernel.org> wrote:
> > > > > >
> > > > > > On Mon, Feb 03, 2025 at 11:23:20AM -0800, Joanne Koong wrote:
> > > > > > > On Mon, Feb 3, 2025 at 10:59 AM Darrick J. Wong <djwong@kernel.org> wrote:
> > > > > > > >
> > > > > > > > On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote:
> > > > > > > > > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote:
> > > > > > > > > > > Add support for reads/writes from buffers backed by hugepages.
> > > > > > > > > > > This can be enabled through the '-h' flag. This flag should only be used
> > > > > > > > > > > on systems where THP capabilities are enabled.
> > > > > > > > > > >
> > > > > > > > > > > This is motivated by a recent bug that was due to faulty handling of
> > > > > > > > > > > userspace buffers backed by hugepages. This patch is a mitigation
> > > > > > > > > > > against problems like this in the future.
> > > > > > > > > > >
> > > > > > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > > > > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > > > > > > > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > > > > > > > > > > ---
> > > > > > > > > >
> > > > > > > > > > Those two test cases fail on old system which doesn't support
> > > > > > > > > > MADV_COLLAPSE:
> > > > > > > > > >
> > > > > > > > > >    fsx -N 10000 -l 500000 -h
> > > > > > > > > >   -fsx -N 10000 -o 8192 -l 500000 -h
> > > > > > > > > >   -fsx -N 10000 -o 128000 -l 500000 -h
> > > > > > > > > >   +MADV_COLLAPSE not supported. Can't support -h
> > > > > > > > > >
> > > > > > > > > > and
> > > > > > > > > >
> > > > > > > > > >    fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > > > >   -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > > > >   -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > > > >   +mapped writes DISABLED
> > > > > > > > > >   +MADV_COLLAPSE not supported. Can't support -h
> > > > > > > > > >
> > > > > > > > > > I'm wondering ...
> > > > > > > > > >
> > > > > > > > > > >  ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > > > > > > > > > >  1 file changed, 153 insertions(+), 13 deletions(-)
> > > > > > > > > > >
> > > > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > > > > > > > > index 41933354..3be383c6 100644
> > > > > > > > > > > --- a/ltp/fsx.c
> > > > > > > > > > > +++ b/ltp/fsx.c
> > > > > > > > > > >  static struct option longopts[] = {
> > > > > > > > > > >       {"replay-ops", required_argument, 0, 256},
> > > > > > > > > > >       {"record-ops", optional_argument, 0, 255},
> > > > > > > > > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv)
> > > > > > > > > > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > > > > > > > > > >
> > > > > > > > > > >       while ((ch = getopt_long(argc, argv,
> > > > > > > > > > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > > > > > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > > > > > >                                longopts, NULL)) != EOF)
> > > > > > > > > > >               switch (ch) {
> > > > > > > > > > >               case 'b':
> > > > > > > > > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv)
> > > > > > > > > > >               case 'g':
> > > > > > > > > > >                       filldata = *optarg;
> > > > > > > > > > >                       break;
> > > > > > > > > > > +             case 'h':
> > > > > > > > > > > +#ifndef MADV_COLLAPSE
> > > > > > > > > > > +                     fprintf(stderr, "MADV_COLLAPSE not supported. "
> > > > > > > > > > > +                             "Can't support -h\n");
> > > > > > > > > > > +                     exit(86);
> > > > > > > > > > > +#endif
> > > > > > > > > > > +                     hugepages = 1;
> > > > > > > > > > > +                     break;
> > > > > > > > > >
> > > > > > > > > > ...
> > > > > > > > > > if we could change this part to:
> > > > > > > > > >
> > > > > > > > > > #ifdef MADV_COLLAPSE
> > > > > > > > > >         hugepages = 1;
> > > > > > > > > > #endif
> > > > > > > > > >         break;
> > > > > > > > > >
> > > > > > > > > > to avoid the test failures on old systems.
> > > > > > > > > >
> > > > > > > > > > Or any better ideas from you :)
> > > > > > > > >
> > > > > > > > > Hi Zorro,
> > > > > > > > >
> > > > > > > > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What
> > > > > > > > > do you think about skipping generic/758 and generic/759 if the kernel
> > > > > > > > > version is older than 6.1? That to me seems more preferable than the
> > > > > > > > > paste above, as the paste above would execute the test as if it did
> > > > > > > > > test hugepages when in reality it didn't, which would be misleading.
> > > > > > > >
> > > > > > > > Now that I've gotten to try this out --
> > > > > > > >
> > > > > > > > There's a couple of things going on here.  The first is that generic/759
> > > > > > > > and 760 need to check if invoking fsx -h causes it to spit out the
> > > > > > > > "MADV_COLLAPSE not supported" error and _notrun the test.
> > > > > > > >
> > > > > > > > The second thing is that userspace programs can ensure the existence of
> > > > > > > > MADV_COLLAPSE in multiple ways.  The first way is through sys/mman.h,
> > > > > > > > which requires that the underlying C library headers are new enough to
> > > > > > > > include a definition.  glibc 2.37 is new enough, but even things like
> > > > > > > > Debian 12 and RHEL 9 aren't new enough to have that.  Other C libraries
> > > > > > > > might not follow glibc's practice of wrapping and/or redefining symbols
> > > > > > > > in a way that you hope is the same as...
> > > > > > > >
> > > > > > > > The second way is through linux/mman.h, which comes from the kernel
> > > > > > > > headers package; and the third way is for the program to define it
> > > > > > > > itself if nobody else does.
> > > > > > > >
> > > > > > > > So I think the easiest way to fix the fsx.c build is to include
> > > > > > > > linux/mman.h in addition to sys/mman.h.  Sorry I didn't notice these
> > > > > > >
> > > > > > > Thanks for your input. Do we still need sys/mman.h if linux/mman.h is added?
> > > > > >
> > > > > > Yes, because glibc provides the mmap() function that wraps
> > > > > > syscall(__NR_mmap, ...);
> > > > > >
> > > > > > > For generic/758 and 759, does it suffice to gate this on whether the
> > > > > > > kernel version if 6.1+ and _notrun if not? My understanding is that
> > > > > > > any kernel version 6.1 or newer will have MADV_COLLAPSE in its kernel
> > > > > > > headers package and support the feature.
> > > > > >
> > > > > > No, because some (most?) vendors backport new features into existing
> > > > > > kernels without revving the version number of that kernel.
> > > > >
> > > > > Oh okay, I see. That makes sense, thanks for the explanation.
> > > > >
> > > > > >
> > > > > > Maybe the following fixes things?
> > > > > >
> > > > > > --D
> > > > > >
> > > > > > generic/759,760: fix MADV_COLLAPSE detection and inclusion
> > > > > >
> > > > > > On systems with "old" C libraries such as glibc 2.36 in Debian 12, the
> > > > > > MADV_COLLAPSE flag might not be defined in any of the header files
> > > > > > pulled in by sys/mman.h, which means that the fsx binary might not get
> > > > > > built with any of the MADV_COLLAPSE code.  If the kernel supports THP,
> > > > > > the test will fail with:
> > > > > >
> > > > > > >  QA output created by 760
> > > > > > >  fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > +mapped writes DISABLED
> > > > > > > +MADV_COLLAPSE not supported. Can't support -h
> > > > > >
> > > > > > Fix both tests to detect fsx binaries that don't support MADV_COLLAPSE,
> > > > > > then fix fsx.c to include the mman.h from the kernel headers (aka
> > > > > > linux/mman.h) so that we can actually test IOs to and from THPs if the
> > > > > > kernel is newer than the rest of userspace.
> > > > > >
> > > > > > Cc: <fstests@vger.kernel.org> # v2025.02.02
> > > > > > Fixes: 627289232371e3 ("generic: add tests for read/writes from hugepages-backed buffers")
> > > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> > > > > > ---
> > > > > >  ltp/fsx.c         |    1 +
> > > > > >  tests/generic/759 |    3 +++
> > > > > >  tests/generic/760 |    3 +++
> > > > > >  3 files changed, 7 insertions(+)
> > > > > >
> > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > > > index 634c496ffe9317..cf9502a74c17a7 100644
> > > > > > --- a/ltp/fsx.c
> > > > > > +++ b/ltp/fsx.c
> > > > > > @@ -20,6 +20,7 @@
> > > > > >  #include <strings.h>
> > > > > >  #include <sys/file.h>
> > > > > >  #include <sys/mman.h>
> > > > > > +#include <linux/mman.h>
> > > > > >  #include <sys/uio.h>
> > > > > >  #include <stdbool.h>
> > > > > >  #ifdef HAVE_ERR_H
> > > > > > diff --git a/tests/generic/759 b/tests/generic/759
> > > > > > index 6c74478aa8a0e0..3549c5ed6a9299 100755
> > > > > > --- a/tests/generic/759
> > > > > > +++ b/tests/generic/759
> > > > > > @@ -14,6 +14,9 @@ _begin_fstest rw auto quick
> > > > > >  _require_test
> > > > > >  _require_thp
> > > > > >
> > > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \
> > > > > > +       _notrun "fsx binary does not support MADV_COLLAPSE"
> > > > > > +
> > > > > >  run_fsx -N 10000            -l 500000 -h
> > > > > >  run_fsx -N 10000  -o 8192   -l 500000 -h
> > > > > >  run_fsx -N 10000  -o 128000 -l 500000 -h
> > > > > > diff --git a/tests/generic/760 b/tests/generic/760
> > > > > > index c71a196222ad3b..2fbd28502ae678 100755
> > > > > > --- a/tests/generic/760
> > > > > > +++ b/tests/generic/760
> > > > > > @@ -15,6 +15,9 @@ _require_test
> > > > > >  _require_odirect
> > > > > >  _require_thp
> > > > > >
> > > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \
> > > > > > +       _notrun "fsx binary does not support MADV_COLLAPSE"
> > > > > > +
> > > > >
> > > > > I tried this out locally and it works for me:
> > > > >
> > > > > generic/759 8s ... [not run] fsx binary does not support MADV_COLLAPSE
> > > > > Ran: generic/759
> > > > > Not run: generic/759
> > > > > Passed all 1 tests
> > > > >
> > > > > SECTION       -- fuse
> > > > > =========================
> > > > > Ran: generic/759
> > > > > Not run: generic/759
> > > > > Passed all 1 tests
> > > >
> > > > Does the test actually run if you /do/ have kernel/libc headers that
> > > > define MADV_COLLAPSE?  And if so, does that count as a Tested-by?
> > > >
> > >
> > > I'm not sure if I fully understand the subtext of your question but
> > > yes, the test runs on my system when MADV_COLLAPSE is defined in the
> > > kernel/libc headers.
> >
> > Yep, you understood me correctly. :)
> >
> > > For sanity checking the inverse case, (eg MADV_COLLAPSE not defined),
> > > I tried this out by modifying the ifdef/ifndef checks in fsx to
> > > MADV_COLLAPSE_ to see if the '$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 |
> > > grep -q 'MADV_COLLAPSE not supported'' line catches that.
> >
> > <nod> Sounds good then; I'll add this to my test clod and run it
> > overnight.
>
> The arm64 vms with 64k base pages spat out this:
>
> --- /run/fstests/bin/tests/generic/759.out      2025-02-02 08:36:28.007248055 -0800
> +++ /var/tmp/fstests/generic/759.out.bad        2025-02-03 16:51:34.862964640 -0800
> @@ -1,4 +1,5 @@
>  QA output created by 759
>  fsx -N 10000 -l 500000 -h
> -fsx -N 10000 -o 8192 -l 500000 -h
> -fsx -N 10000 -o 128000 -l 500000 -h
> +Seed set to 1
> +madvise collapse for buf: Cannot allocate memory
> +init_hugepages_buf failed for good_buf: Cannot allocate memory
>
> Note that it was trying to create a 512M page(!) on a VM with 8G of
> memory.  Normally one doesn't use large base page size in low memory
> environments, but this /is/ fstestsland. 8-)
>
> From commit 7d8faaf155454f ("mm/madvise: introduce MADV_COLLAPSE sync
> hugepage collapse") :
>
>         ENOMEM  Memory allocation failed or VMA not found
>         EBUSY   Memcg charging failed
>         EAGAIN  Required resource temporarily unavailable.  Try again
>                 might succeed.
>         EINVAL  Other error: No PMD found, subpage doesn't have Present
>                 bit set, "Special" page no backed by struct page, VMA
>                 incorrectly sized, address not page-aligned, ...
>
> It sounds like ENOMEM/EBUSY/EINVAL should result in
> _notrun "Could not generate hugepage" ?  What are your thoughts?
>

Thanks for running it overnight. This sounds good to me, but will this
be robust against false negatives? For example, if it succeeds when
we're doing the

$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not
supported' &&  _notrun "fsx binary does not support MADV_COLLAPSE"

check, does that guarantee that the actual run won't error out with
ENOMEM/EBUSY/EINVAL? It seems like there could be the case where it
passes the check but then when it actually runs, the system state
memory state may have changed and now memcg charging or the memory
allocation fails? EAGAIN seems a bit iffy to me - hopefully this
doesn't happen often but if it does, it would likely be a false
negative fail for the generic test?

Maybe instead of "$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q
'MADV_COLLAPSE not supported' &&  _notrun "fsx binary does not support
MADV_COLLAPSE"", we should run the test and then if the output to the
.out file is "MADV_COLLAPSE not supported", then we deem that a
pass/success? Not sure if this is possible in the fstest
infrastructure though for checking against two possible outputs in the
.out file. What are your thoughts?


Thanks,
Joanne

> --D
>
> > --D
> >
> > >
> > > > --D
> > > >
> > > > >
> > > > > Thanks,
> > > > > Joanne
> > > > >
> > > > > >  psize=`$here/src/feature -s`
> > > > > >  bsize=`$here/src/min_dio_alignment $TEST_DIR $TEST_DEV`
> > > > > >
> > > > >
> >
Joanne Koong Feb. 4, 2025, 8:54 p.m. UTC | #13
On Tue, Feb 4, 2025 at 9:05 AM Zorro Lang <zlang@redhat.com> wrote:
>
> On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote:
> > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote:
> > >
> > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote:
> > > > Add support for reads/writes from buffers backed by hugepages.
> > > > This can be enabled through the '-h' flag. This flag should only be used
> > > > on systems where THP capabilities are enabled.
> > > >
> > > > This is motivated by a recent bug that was due to faulty handling of
> > > > userspace buffers backed by hugepages. This patch is a mitigation
> > > > against problems like this in the future.
> > > >
> > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > > > ---
> > >
> > > Those two test cases fail on old system which doesn't support
> > > MADV_COLLAPSE:
> > >
> > >    fsx -N 10000 -l 500000 -h
> > >   -fsx -N 10000 -o 8192 -l 500000 -h
> > >   -fsx -N 10000 -o 128000 -l 500000 -h
> > >   +MADV_COLLAPSE not supported. Can't support -h
> > >
> > > and
> > >
> > >    fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > >   -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > >   -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > >   +mapped writes DISABLED
> > >   +MADV_COLLAPSE not supported. Can't support -h
> > >
> > > I'm wondering ...
> > >
> > > >  ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > > >  1 file changed, 153 insertions(+), 13 deletions(-)
> > > >
> > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > index 41933354..3be383c6 100644
> > > > --- a/ltp/fsx.c
> > > > +++ b/ltp/fsx.c
> > > > @@ -190,6 +190,16 @@ int      o_direct;                       /* -Z */
> > > > +
> > > >  static struct option longopts[] = {
> > > >       {"replay-ops", required_argument, 0, 256},
> > > >       {"record-ops", optional_argument, 0, 255},
> > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv)
> > > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > > >
> > > >       while ((ch = getopt_long(argc, argv,
> > > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > >                                longopts, NULL)) != EOF)
> > > >               switch (ch) {
> > > >               case 'b':
> > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv)
> > > >               case 'g':
> > > >                       filldata = *optarg;
> > > >                       break;
> > > > +             case 'h':
> > > > +#ifndef MADV_COLLAPSE
> > > > +                     fprintf(stderr, "MADV_COLLAPSE not supported. "
> > > > +                             "Can't support -h\n");
> > > > +                     exit(86);
> > > > +#endif
> > > > +                     hugepages = 1;
> > > > +                     break;
> > >
> > > ...
> > > if we could change this part to:
> > >
> > > #ifdef MADV_COLLAPSE
> > >         hugepages = 1;
> > > #endif
> > >         break;
> > >
> > > to avoid the test failures on old systems.
> > >
> > > Or any better ideas from you :)
> >
> > Hi Zorro,
> >
> > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What
> > do you think about skipping generic/758 and generic/759 if the kernel
> > version is older than 6.1? That to me seems more preferable than the
> > paste above, as the paste above would execute the test as if it did
> > test hugepages when in reality it didn't, which would be misleading.
>
> Hi Joanne,
>
> Sorry I'm still on my holiday, reply a bit late and hastily. At first,
> your patch has been merged, the merged case numbers are g/759 and g/760,
> you can rebase to latest for-next branch and write new fix patch :)

Hi Zorro,

No worries at all. Thanks for your work on this.

>
> Then, you're right, above code change is bit rough;) Maybe there's a way
> to _notrun if MADV_COLLAPSE isn't supported?

I'll (or Darrick if we go with his solution) make sure to submit a fix
for this so that it doesn't break the older systems.

Thanks,
Joanne

>
> Thanks,
> Zorro
>
> >
> >
> > Thanks,
> > Joanne
> >
> > >
> > > Thanks,
> > > Zorro
> > >
> > > >               case 'i':
> > > >                       integrity = 1;
> > > >                       logdev = strdup(optarg);
> > > > @@ -3229,15 +3377,7 @@ main(int argc, char **argv)
> > > >                       exit(95);
> > > >               }
> > > >       }
> > > > -     original_buf = (char *) malloc(maxfilelen);
> > > > -     for (i = 0; i < maxfilelen; i++)
> > > > -             original_buf[i] = random() % 256;
> > > > -     good_buf = (char *) malloc(maxfilelen + writebdy);
> > > > -     good_buf = round_ptr_up(good_buf, writebdy, 0);
> > > > -     memset(good_buf, '\0', maxfilelen);
> > > > -     temp_buf = (char *) malloc(maxoplen + readbdy);
> > > > -     temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> > > > -     memset(temp_buf, '\0', maxoplen);
> > > > +     init_buffers();
> > > >       if (lite) {     /* zero entire existing file */
> > > >               ssize_t written;
> > > >
> > > > --
> > > > 2.47.1
> > > >
> > >
> >
>
Darrick J. Wong Feb. 4, 2025, 9:31 p.m. UTC | #14
On Tue, Feb 04, 2025 at 12:50:04PM -0800, Joanne Koong wrote:
> On Mon, Feb 3, 2025 at 8:21 PM Darrick J. Wong <djwong@kernel.org> wrote:
> >
> > On Mon, Feb 03, 2025 at 01:54:32PM -0800, Darrick J. Wong wrote:
> > > On Mon, Feb 03, 2025 at 01:40:39PM -0800, Joanne Koong wrote:
> > > > On Mon, Feb 3, 2025 at 12:01 PM Darrick J. Wong <djwong@kernel.org> wrote:
> > > > >
> > > > > On Mon, Feb 03, 2025 at 11:57:23AM -0800, Joanne Koong wrote:
> > > > > > On Mon, Feb 3, 2025 at 11:41 AM Darrick J. Wong <djwong@kernel.org> wrote:
> > > > > > >
> > > > > > > On Mon, Feb 03, 2025 at 11:23:20AM -0800, Joanne Koong wrote:
> > > > > > > > On Mon, Feb 3, 2025 at 10:59 AM Darrick J. Wong <djwong@kernel.org> wrote:
> > > > > > > > >
> > > > > > > > > On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote:
> > > > > > > > > > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote:
> > > > > > > > > > > > Add support for reads/writes from buffers backed by hugepages.
> > > > > > > > > > > > This can be enabled through the '-h' flag. This flag should only be used
> > > > > > > > > > > > on systems where THP capabilities are enabled.
> > > > > > > > > > > >
> > > > > > > > > > > > This is motivated by a recent bug that was due to faulty handling of
> > > > > > > > > > > > userspace buffers backed by hugepages. This patch is a mitigation
> > > > > > > > > > > > against problems like this in the future.
> > > > > > > > > > > >
> > > > > > > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > > > > > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > > > > > > > > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > > > > > > > > > > > ---
> > > > > > > > > > >
> > > > > > > > > > > Those two test cases fail on old system which doesn't support
> > > > > > > > > > > MADV_COLLAPSE:
> > > > > > > > > > >
> > > > > > > > > > >    fsx -N 10000 -l 500000 -h
> > > > > > > > > > >   -fsx -N 10000 -o 8192 -l 500000 -h
> > > > > > > > > > >   -fsx -N 10000 -o 128000 -l 500000 -h
> > > > > > > > > > >   +MADV_COLLAPSE not supported. Can't support -h
> > > > > > > > > > >
> > > > > > > > > > > and
> > > > > > > > > > >
> > > > > > > > > > >    fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > > > > >   -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > > > > >   -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > > > > >   +mapped writes DISABLED
> > > > > > > > > > >   +MADV_COLLAPSE not supported. Can't support -h
> > > > > > > > > > >
> > > > > > > > > > > I'm wondering ...
> > > > > > > > > > >
> > > > > > > > > > > >  ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > > > > > > > > > > >  1 file changed, 153 insertions(+), 13 deletions(-)
> > > > > > > > > > > >
> > > > > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > > > > > > > > > index 41933354..3be383c6 100644
> > > > > > > > > > > > --- a/ltp/fsx.c
> > > > > > > > > > > > +++ b/ltp/fsx.c
> > > > > > > > > > > >  static struct option longopts[] = {
> > > > > > > > > > > >       {"replay-ops", required_argument, 0, 256},
> > > > > > > > > > > >       {"record-ops", optional_argument, 0, 255},
> > > > > > > > > > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv)
> > > > > > > > > > > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > > > > > > > > > > >
> > > > > > > > > > > >       while ((ch = getopt_long(argc, argv,
> > > > > > > > > > > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > > > > > > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > > > > > > >                                longopts, NULL)) != EOF)
> > > > > > > > > > > >               switch (ch) {
> > > > > > > > > > > >               case 'b':
> > > > > > > > > > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv)
> > > > > > > > > > > >               case 'g':
> > > > > > > > > > > >                       filldata = *optarg;
> > > > > > > > > > > >                       break;
> > > > > > > > > > > > +             case 'h':
> > > > > > > > > > > > +#ifndef MADV_COLLAPSE
> > > > > > > > > > > > +                     fprintf(stderr, "MADV_COLLAPSE not supported. "
> > > > > > > > > > > > +                             "Can't support -h\n");
> > > > > > > > > > > > +                     exit(86);
> > > > > > > > > > > > +#endif
> > > > > > > > > > > > +                     hugepages = 1;
> > > > > > > > > > > > +                     break;
> > > > > > > > > > >
> > > > > > > > > > > ...
> > > > > > > > > > > if we could change this part to:
> > > > > > > > > > >
> > > > > > > > > > > #ifdef MADV_COLLAPSE
> > > > > > > > > > >         hugepages = 1;
> > > > > > > > > > > #endif
> > > > > > > > > > >         break;
> > > > > > > > > > >
> > > > > > > > > > > to avoid the test failures on old systems.
> > > > > > > > > > >
> > > > > > > > > > > Or any better ideas from you :)
> > > > > > > > > >
> > > > > > > > > > Hi Zorro,
> > > > > > > > > >
> > > > > > > > > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What
> > > > > > > > > > do you think about skipping generic/758 and generic/759 if the kernel
> > > > > > > > > > version is older than 6.1? That to me seems more preferable than the
> > > > > > > > > > paste above, as the paste above would execute the test as if it did
> > > > > > > > > > test hugepages when in reality it didn't, which would be misleading.
> > > > > > > > >
> > > > > > > > > Now that I've gotten to try this out --
> > > > > > > > >
> > > > > > > > > There's a couple of things going on here.  The first is that generic/759
> > > > > > > > > and 760 need to check if invoking fsx -h causes it to spit out the
> > > > > > > > > "MADV_COLLAPSE not supported" error and _notrun the test.
> > > > > > > > >
> > > > > > > > > The second thing is that userspace programs can ensure the existence of
> > > > > > > > > MADV_COLLAPSE in multiple ways.  The first way is through sys/mman.h,
> > > > > > > > > which requires that the underlying C library headers are new enough to
> > > > > > > > > include a definition.  glibc 2.37 is new enough, but even things like
> > > > > > > > > Debian 12 and RHEL 9 aren't new enough to have that.  Other C libraries
> > > > > > > > > might not follow glibc's practice of wrapping and/or redefining symbols
> > > > > > > > > in a way that you hope is the same as...
> > > > > > > > >
> > > > > > > > > The second way is through linux/mman.h, which comes from the kernel
> > > > > > > > > headers package; and the third way is for the program to define it
> > > > > > > > > itself if nobody else does.
> > > > > > > > >
> > > > > > > > > So I think the easiest way to fix the fsx.c build is to include
> > > > > > > > > linux/mman.h in addition to sys/mman.h.  Sorry I didn't notice these
> > > > > > > >
> > > > > > > > Thanks for your input. Do we still need sys/mman.h if linux/mman.h is added?
> > > > > > >
> > > > > > > Yes, because glibc provides the mmap() function that wraps
> > > > > > > syscall(__NR_mmap, ...);
> > > > > > >
> > > > > > > > For generic/758 and 759, does it suffice to gate this on whether the
> > > > > > > > kernel version if 6.1+ and _notrun if not? My understanding is that
> > > > > > > > any kernel version 6.1 or newer will have MADV_COLLAPSE in its kernel
> > > > > > > > headers package and support the feature.
> > > > > > >
> > > > > > > No, because some (most?) vendors backport new features into existing
> > > > > > > kernels without revving the version number of that kernel.
> > > > > >
> > > > > > Oh okay, I see. That makes sense, thanks for the explanation.
> > > > > >
> > > > > > >
> > > > > > > Maybe the following fixes things?
> > > > > > >
> > > > > > > --D
> > > > > > >
> > > > > > > generic/759,760: fix MADV_COLLAPSE detection and inclusion
> > > > > > >
> > > > > > > On systems with "old" C libraries such as glibc 2.36 in Debian 12, the
> > > > > > > MADV_COLLAPSE flag might not be defined in any of the header files
> > > > > > > pulled in by sys/mman.h, which means that the fsx binary might not get
> > > > > > > built with any of the MADV_COLLAPSE code.  If the kernel supports THP,
> > > > > > > the test will fail with:
> > > > > > >
> > > > > > > >  QA output created by 760
> > > > > > > >  fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > > +mapped writes DISABLED
> > > > > > > > +MADV_COLLAPSE not supported. Can't support -h
> > > > > > >
> > > > > > > Fix both tests to detect fsx binaries that don't support MADV_COLLAPSE,
> > > > > > > then fix fsx.c to include the mman.h from the kernel headers (aka
> > > > > > > linux/mman.h) so that we can actually test IOs to and from THPs if the
> > > > > > > kernel is newer than the rest of userspace.
> > > > > > >
> > > > > > > Cc: <fstests@vger.kernel.org> # v2025.02.02
> > > > > > > Fixes: 627289232371e3 ("generic: add tests for read/writes from hugepages-backed buffers")
> > > > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> > > > > > > ---
> > > > > > >  ltp/fsx.c         |    1 +
> > > > > > >  tests/generic/759 |    3 +++
> > > > > > >  tests/generic/760 |    3 +++
> > > > > > >  3 files changed, 7 insertions(+)
> > > > > > >
> > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > > > > index 634c496ffe9317..cf9502a74c17a7 100644
> > > > > > > --- a/ltp/fsx.c
> > > > > > > +++ b/ltp/fsx.c
> > > > > > > @@ -20,6 +20,7 @@
> > > > > > >  #include <strings.h>
> > > > > > >  #include <sys/file.h>
> > > > > > >  #include <sys/mman.h>
> > > > > > > +#include <linux/mman.h>
> > > > > > >  #include <sys/uio.h>
> > > > > > >  #include <stdbool.h>
> > > > > > >  #ifdef HAVE_ERR_H
> > > > > > > diff --git a/tests/generic/759 b/tests/generic/759
> > > > > > > index 6c74478aa8a0e0..3549c5ed6a9299 100755
> > > > > > > --- a/tests/generic/759
> > > > > > > +++ b/tests/generic/759
> > > > > > > @@ -14,6 +14,9 @@ _begin_fstest rw auto quick
> > > > > > >  _require_test
> > > > > > >  _require_thp
> > > > > > >
> > > > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \
> > > > > > > +       _notrun "fsx binary does not support MADV_COLLAPSE"
> > > > > > > +
> > > > > > >  run_fsx -N 10000            -l 500000 -h
> > > > > > >  run_fsx -N 10000  -o 8192   -l 500000 -h
> > > > > > >  run_fsx -N 10000  -o 128000 -l 500000 -h
> > > > > > > diff --git a/tests/generic/760 b/tests/generic/760
> > > > > > > index c71a196222ad3b..2fbd28502ae678 100755
> > > > > > > --- a/tests/generic/760
> > > > > > > +++ b/tests/generic/760
> > > > > > > @@ -15,6 +15,9 @@ _require_test
> > > > > > >  _require_odirect
> > > > > > >  _require_thp
> > > > > > >
> > > > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \
> > > > > > > +       _notrun "fsx binary does not support MADV_COLLAPSE"
> > > > > > > +
> > > > > >
> > > > > > I tried this out locally and it works for me:
> > > > > >
> > > > > > generic/759 8s ... [not run] fsx binary does not support MADV_COLLAPSE
> > > > > > Ran: generic/759
> > > > > > Not run: generic/759
> > > > > > Passed all 1 tests
> > > > > >
> > > > > > SECTION       -- fuse
> > > > > > =========================
> > > > > > Ran: generic/759
> > > > > > Not run: generic/759
> > > > > > Passed all 1 tests
> > > > >
> > > > > Does the test actually run if you /do/ have kernel/libc headers that
> > > > > define MADV_COLLAPSE?  And if so, does that count as a Tested-by?
> > > > >
> > > >
> > > > I'm not sure if I fully understand the subtext of your question but
> > > > yes, the test runs on my system when MADV_COLLAPSE is defined in the
> > > > kernel/libc headers.
> > >
> > > Yep, you understood me correctly. :)
> > >
> > > > For sanity checking the inverse case, (eg MADV_COLLAPSE not defined),
> > > > I tried this out by modifying the ifdef/ifndef checks in fsx to
> > > > MADV_COLLAPSE_ to see if the '$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 |
> > > > grep -q 'MADV_COLLAPSE not supported'' line catches that.
> > >
> > > <nod> Sounds good then; I'll add this to my test clod and run it
> > > overnight.
> >
> > The arm64 vms with 64k base pages spat out this:
> >
> > --- /run/fstests/bin/tests/generic/759.out      2025-02-02 08:36:28.007248055 -0800
> > +++ /var/tmp/fstests/generic/759.out.bad        2025-02-03 16:51:34.862964640 -0800
> > @@ -1,4 +1,5 @@
> >  QA output created by 759
> >  fsx -N 10000 -l 500000 -h
> > -fsx -N 10000 -o 8192 -l 500000 -h
> > -fsx -N 10000 -o 128000 -l 500000 -h
> > +Seed set to 1
> > +madvise collapse for buf: Cannot allocate memory
> > +init_hugepages_buf failed for good_buf: Cannot allocate memory
> >
> > Note that it was trying to create a 512M page(!) on a VM with 8G of
> > memory.  Normally one doesn't use large base page size in low memory
> > environments, but this /is/ fstestsland. 8-)
> >
> > From commit 7d8faaf155454f ("mm/madvise: introduce MADV_COLLAPSE sync
> > hugepage collapse") :
> >
> >         ENOMEM  Memory allocation failed or VMA not found
> >         EBUSY   Memcg charging failed
> >         EAGAIN  Required resource temporarily unavailable.  Try again
> >                 might succeed.
> >         EINVAL  Other error: No PMD found, subpage doesn't have Present
> >                 bit set, "Special" page no backed by struct page, VMA
> >                 incorrectly sized, address not page-aligned, ...
> >
> > It sounds like ENOMEM/EBUSY/EINVAL should result in
> > _notrun "Could not generate hugepage" ?  What are your thoughts?
> >
> 
> Thanks for running it overnight. This sounds good to me, but will this
> be robust against false negatives? For example, if it succeeds when
> we're doing the
> 
> $here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not
> supported' &&  _notrun "fsx binary does not support MADV_COLLAPSE"
> 
> check, does that guarantee that the actual run won't error out with
> ENOMEM/EBUSY/EINVAL?

Nope.  That code only checks that the fsx binary was built with
MADV_COLLAPSE support, not that any of it actually works.

>                      It seems like there could be the case where it
> passes the check but then when it actually runs, the system state
> memory state may have changed and now memcg charging or the memory
> allocation fails?

Indeed, I think the error I saw is exactly this happening.

In the end I turned the fsx -h invocation into a helper that checks for
any of those three errors (ENOMEM/BUSY/INVAL) and _notruns the test if
we can't even get a hugepage at the start.

# Run fsx with -h(ugepage buffers).  If we can't set up a hugepage then skip
# the test, but if any other error occurs then exit the test.
_run_hugepage_fsx() {
	_run_fsx "$@" -h &> $tmp.hugepage_fsx
	local res=$?
	if [ $res -eq 103 ]; then
		# According to the MADV_COLLAPSE manpage, these three errors
		# can happen if the kernel could not collapse a collection of
		# pages into a single huge page.
		grep -q -E ' for hugebuf: (Cannot allocate memory|Device or resource busy|Resource temporarily unavailable)' $tmp.hugepage_fsx && \
			_notrun "Could not set up huge page for test"
	fi
	cat $tmp.hugepage_fsx
	rm -f $tmp.hugepage_fsx
	test $res -ne 0 && exit 1
	return 0
}

Note that it won't prevent or paper over subsequent MADV_COLLAPSE
failures once the process is up and running, though as long as nothing
else in fsx fails or corrupts the file, the collapse failures won't be
reported.  (As is the case now).

Anyway I'll send patches for both issues.

> allocation fails? EAGAIN seems a bit iffy to me - hopefully this
> doesn't happen often but if it does, it would likely be a false
> negative fail for the generic test?
> 
> Maybe instead of "$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q
> 'MADV_COLLAPSE not supported' &&  _notrun "fsx binary does not support
> MADV_COLLAPSE"", we should run the test and then if the output to the
> .out file is "MADV_COLLAPSE not supported", then we deem that a
> pass/success? Not sure if this is possible in the fstest
> infrastructure though for checking against two possible outputs in the
> .out file. What are your thoughts?

You ... can switch the .out(put) dynamically, but the mechanism to do it
is underdocumented and quirky; and it makes understanding what the test
does rather difficult.  See _link_out_file in xfs/614 if you want to
open that Pandora's box.

--D

> 
> 
> Thanks,
> Joanne
> 
> > --D
> >
> > > --D
> > >
> > > >
> > > > > --D
> > > > >
> > > > > >
> > > > > > Thanks,
> > > > > > Joanne
> > > > > >
> > > > > > >  psize=`$here/src/feature -s`
> > > > > > >  bsize=`$here/src/min_dio_alignment $TEST_DIR $TEST_DEV`
> > > > > > >
> > > > > >
> > >
>
Joanne Koong Feb. 4, 2025, 9:52 p.m. UTC | #15
On Tue, Feb 4, 2025 at 1:31 PM Darrick J. Wong <djwong@kernel.org> wrote:
>
> On Tue, Feb 04, 2025 at 12:50:04PM -0800, Joanne Koong wrote:
> > On Mon, Feb 3, 2025 at 8:21 PM Darrick J. Wong <djwong@kernel.org> wrote:
> > >
> > > On Mon, Feb 03, 2025 at 01:54:32PM -0800, Darrick J. Wong wrote:
> > > > On Mon, Feb 03, 2025 at 01:40:39PM -0800, Joanne Koong wrote:
> > > > > On Mon, Feb 3, 2025 at 12:01 PM Darrick J. Wong <djwong@kernel.org> wrote:
> > > > > >
> > > > > > On Mon, Feb 03, 2025 at 11:57:23AM -0800, Joanne Koong wrote:
> > > > > > > On Mon, Feb 3, 2025 at 11:41 AM Darrick J. Wong <djwong@kernel.org> wrote:
> > > > > > > >
> > > > > > > > On Mon, Feb 03, 2025 at 11:23:20AM -0800, Joanne Koong wrote:
> > > > > > > > > On Mon, Feb 3, 2025 at 10:59 AM Darrick J. Wong <djwong@kernel.org> wrote:
> > > > > > > > > >
> > > > > > > > > > On Mon, Feb 03, 2025 at 10:04:04AM -0800, Joanne Koong wrote:
> > > > > > > > > > > On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@redhat.com> wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote:
> > > > > > > > > > > > > Add support for reads/writes from buffers backed by hugepages.
> > > > > > > > > > > > > This can be enabled through the '-h' flag. This flag should only be used
> > > > > > > > > > > > > on systems where THP capabilities are enabled.
> > > > > > > > > > > > >
> > > > > > > > > > > > > This is motivated by a recent bug that was due to faulty handling of
> > > > > > > > > > > > > userspace buffers backed by hugepages. This patch is a mitigation
> > > > > > > > > > > > > against problems like this in the future.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > > > > > > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > > > > > > > > > > > Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > > > > > > > > > > > > ---
> > > > > > > > > > > >
> > > > > > > > > > > > Those two test cases fail on old system which doesn't support
> > > > > > > > > > > > MADV_COLLAPSE:
> > > > > > > > > > > >
> > > > > > > > > > > >    fsx -N 10000 -l 500000 -h
> > > > > > > > > > > >   -fsx -N 10000 -o 8192 -l 500000 -h
> > > > > > > > > > > >   -fsx -N 10000 -o 128000 -l 500000 -h
> > > > > > > > > > > >   +MADV_COLLAPSE not supported. Can't support -h
> > > > > > > > > > > >
> > > > > > > > > > > > and
> > > > > > > > > > > >
> > > > > > > > > > > >    fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > > > > > >   -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > > > > > >   -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > > > > > >   +mapped writes DISABLED
> > > > > > > > > > > >   +MADV_COLLAPSE not supported. Can't support -h
> > > > > > > > > > > >
> > > > > > > > > > > > I'm wondering ...
> > > > > > > > > > > >
> > > > > > > > > > > > >  ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > > > > > > > > > > > >  1 file changed, 153 insertions(+), 13 deletions(-)
> > > > > > > > > > > > >
> > > > > > > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > > > > > > > > > > index 41933354..3be383c6 100644
> > > > > > > > > > > > > --- a/ltp/fsx.c
> > > > > > > > > > > > > +++ b/ltp/fsx.c
> > > > > > > > > > > > >  static struct option longopts[] = {
> > > > > > > > > > > > >       {"replay-ops", required_argument, 0, 256},
> > > > > > > > > > > > >       {"record-ops", optional_argument, 0, 255},
> > > > > > > > > > > > > @@ -2883,7 +3023,7 @@ main(int argc, char **argv)
> > > > > > > > > > > > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > > > > > > > > > > > >
> > > > > > > > > > > > >       while ((ch = getopt_long(argc, argv,
> > > > > > > > > > > > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > > > > > > > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > > > > > > > >                                longopts, NULL)) != EOF)
> > > > > > > > > > > > >               switch (ch) {
> > > > > > > > > > > > >               case 'b':
> > > > > > > > > > > > > @@ -2916,6 +3056,14 @@ main(int argc, char **argv)
> > > > > > > > > > > > >               case 'g':
> > > > > > > > > > > > >                       filldata = *optarg;
> > > > > > > > > > > > >                       break;
> > > > > > > > > > > > > +             case 'h':
> > > > > > > > > > > > > +#ifndef MADV_COLLAPSE
> > > > > > > > > > > > > +                     fprintf(stderr, "MADV_COLLAPSE not supported. "
> > > > > > > > > > > > > +                             "Can't support -h\n");
> > > > > > > > > > > > > +                     exit(86);
> > > > > > > > > > > > > +#endif
> > > > > > > > > > > > > +                     hugepages = 1;
> > > > > > > > > > > > > +                     break;
> > > > > > > > > > > >
> > > > > > > > > > > > ...
> > > > > > > > > > > > if we could change this part to:
> > > > > > > > > > > >
> > > > > > > > > > > > #ifdef MADV_COLLAPSE
> > > > > > > > > > > >         hugepages = 1;
> > > > > > > > > > > > #endif
> > > > > > > > > > > >         break;
> > > > > > > > > > > >
> > > > > > > > > > > > to avoid the test failures on old systems.
> > > > > > > > > > > >
> > > > > > > > > > > > Or any better ideas from you :)
> > > > > > > > > > >
> > > > > > > > > > > Hi Zorro,
> > > > > > > > > > >
> > > > > > > > > > > It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What
> > > > > > > > > > > do you think about skipping generic/758 and generic/759 if the kernel
> > > > > > > > > > > version is older than 6.1? That to me seems more preferable than the
> > > > > > > > > > > paste above, as the paste above would execute the test as if it did
> > > > > > > > > > > test hugepages when in reality it didn't, which would be misleading.
> > > > > > > > > >
> > > > > > > > > > Now that I've gotten to try this out --
> > > > > > > > > >
> > > > > > > > > > There's a couple of things going on here.  The first is that generic/759
> > > > > > > > > > and 760 need to check if invoking fsx -h causes it to spit out the
> > > > > > > > > > "MADV_COLLAPSE not supported" error and _notrun the test.
> > > > > > > > > >
> > > > > > > > > > The second thing is that userspace programs can ensure the existence of
> > > > > > > > > > MADV_COLLAPSE in multiple ways.  The first way is through sys/mman.h,
> > > > > > > > > > which requires that the underlying C library headers are new enough to
> > > > > > > > > > include a definition.  glibc 2.37 is new enough, but even things like
> > > > > > > > > > Debian 12 and RHEL 9 aren't new enough to have that.  Other C libraries
> > > > > > > > > > might not follow glibc's practice of wrapping and/or redefining symbols
> > > > > > > > > > in a way that you hope is the same as...
> > > > > > > > > >
> > > > > > > > > > The second way is through linux/mman.h, which comes from the kernel
> > > > > > > > > > headers package; and the third way is for the program to define it
> > > > > > > > > > itself if nobody else does.
> > > > > > > > > >
> > > > > > > > > > So I think the easiest way to fix the fsx.c build is to include
> > > > > > > > > > linux/mman.h in addition to sys/mman.h.  Sorry I didn't notice these
> > > > > > > > >
> > > > > > > > > Thanks for your input. Do we still need sys/mman.h if linux/mman.h is added?
> > > > > > > >
> > > > > > > > Yes, because glibc provides the mmap() function that wraps
> > > > > > > > syscall(__NR_mmap, ...);
> > > > > > > >
> > > > > > > > > For generic/758 and 759, does it suffice to gate this on whether the
> > > > > > > > > kernel version if 6.1+ and _notrun if not? My understanding is that
> > > > > > > > > any kernel version 6.1 or newer will have MADV_COLLAPSE in its kernel
> > > > > > > > > headers package and support the feature.
> > > > > > > >
> > > > > > > > No, because some (most?) vendors backport new features into existing
> > > > > > > > kernels without revving the version number of that kernel.
> > > > > > >
> > > > > > > Oh okay, I see. That makes sense, thanks for the explanation.
> > > > > > >
> > > > > > > >
> > > > > > > > Maybe the following fixes things?
> > > > > > > >
> > > > > > > > --D
> > > > > > > >
> > > > > > > > generic/759,760: fix MADV_COLLAPSE detection and inclusion
> > > > > > > >
> > > > > > > > On systems with "old" C libraries such as glibc 2.36 in Debian 12, the
> > > > > > > > MADV_COLLAPSE flag might not be defined in any of the header files
> > > > > > > > pulled in by sys/mman.h, which means that the fsx binary might not get
> > > > > > > > built with any of the MADV_COLLAPSE code.  If the kernel supports THP,
> > > > > > > > the test will fail with:
> > > > > > > >
> > > > > > > > >  QA output created by 760
> > > > > > > > >  fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > > > -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > > > -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
> > > > > > > > > +mapped writes DISABLED
> > > > > > > > > +MADV_COLLAPSE not supported. Can't support -h
> > > > > > > >
> > > > > > > > Fix both tests to detect fsx binaries that don't support MADV_COLLAPSE,
> > > > > > > > then fix fsx.c to include the mman.h from the kernel headers (aka
> > > > > > > > linux/mman.h) so that we can actually test IOs to and from THPs if the
> > > > > > > > kernel is newer than the rest of userspace.
> > > > > > > >
> > > > > > > > Cc: <fstests@vger.kernel.org> # v2025.02.02
> > > > > > > > Fixes: 627289232371e3 ("generic: add tests for read/writes from hugepages-backed buffers")
> > > > > > > > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> > > > > > > > ---
> > > > > > > >  ltp/fsx.c         |    1 +
> > > > > > > >  tests/generic/759 |    3 +++
> > > > > > > >  tests/generic/760 |    3 +++
> > > > > > > >  3 files changed, 7 insertions(+)
> > > > > > > >
> > > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > > > > > index 634c496ffe9317..cf9502a74c17a7 100644
> > > > > > > > --- a/ltp/fsx.c
> > > > > > > > +++ b/ltp/fsx.c
> > > > > > > > @@ -20,6 +20,7 @@
> > > > > > > >  #include <strings.h>
> > > > > > > >  #include <sys/file.h>
> > > > > > > >  #include <sys/mman.h>
> > > > > > > > +#include <linux/mman.h>
> > > > > > > >  #include <sys/uio.h>
> > > > > > > >  #include <stdbool.h>
> > > > > > > >  #ifdef HAVE_ERR_H
> > > > > > > > diff --git a/tests/generic/759 b/tests/generic/759
> > > > > > > > index 6c74478aa8a0e0..3549c5ed6a9299 100755
> > > > > > > > --- a/tests/generic/759
> > > > > > > > +++ b/tests/generic/759
> > > > > > > > @@ -14,6 +14,9 @@ _begin_fstest rw auto quick
> > > > > > > >  _require_test
> > > > > > > >  _require_thp
> > > > > > > >
> > > > > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \
> > > > > > > > +       _notrun "fsx binary does not support MADV_COLLAPSE"
> > > > > > > > +
> > > > > > > >  run_fsx -N 10000            -l 500000 -h
> > > > > > > >  run_fsx -N 10000  -o 8192   -l 500000 -h
> > > > > > > >  run_fsx -N 10000  -o 128000 -l 500000 -h
> > > > > > > > diff --git a/tests/generic/760 b/tests/generic/760
> > > > > > > > index c71a196222ad3b..2fbd28502ae678 100755
> > > > > > > > --- a/tests/generic/760
> > > > > > > > +++ b/tests/generic/760
> > > > > > > > @@ -15,6 +15,9 @@ _require_test
> > > > > > > >  _require_odirect
> > > > > > > >  _require_thp
> > > > > > > >
> > > > > > > > +$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not supported' && \
> > > > > > > > +       _notrun "fsx binary does not support MADV_COLLAPSE"
> > > > > > > > +
> > > > > > >
> > > > > > > I tried this out locally and it works for me:
> > > > > > >
> > > > > > > generic/759 8s ... [not run] fsx binary does not support MADV_COLLAPSE
> > > > > > > Ran: generic/759
> > > > > > > Not run: generic/759
> > > > > > > Passed all 1 tests
> > > > > > >
> > > > > > > SECTION       -- fuse
> > > > > > > =========================
> > > > > > > Ran: generic/759
> > > > > > > Not run: generic/759
> > > > > > > Passed all 1 tests
> > > > > >
> > > > > > Does the test actually run if you /do/ have kernel/libc headers that
> > > > > > define MADV_COLLAPSE?  And if so, does that count as a Tested-by?
> > > > > >
> > > > >
> > > > > I'm not sure if I fully understand the subtext of your question but
> > > > > yes, the test runs on my system when MADV_COLLAPSE is defined in the
> > > > > kernel/libc headers.
> > > >
> > > > Yep, you understood me correctly. :)
> > > >
> > > > > For sanity checking the inverse case, (eg MADV_COLLAPSE not defined),
> > > > > I tried this out by modifying the ifdef/ifndef checks in fsx to
> > > > > MADV_COLLAPSE_ to see if the '$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 |
> > > > > grep -q 'MADV_COLLAPSE not supported'' line catches that.
> > > >
> > > > <nod> Sounds good then; I'll add this to my test clod and run it
> > > > overnight.
> > >
> > > The arm64 vms with 64k base pages spat out this:
> > >
> > > --- /run/fstests/bin/tests/generic/759.out      2025-02-02 08:36:28.007248055 -0800
> > > +++ /var/tmp/fstests/generic/759.out.bad        2025-02-03 16:51:34.862964640 -0800
> > > @@ -1,4 +1,5 @@
> > >  QA output created by 759
> > >  fsx -N 10000 -l 500000 -h
> > > -fsx -N 10000 -o 8192 -l 500000 -h
> > > -fsx -N 10000 -o 128000 -l 500000 -h
> > > +Seed set to 1
> > > +madvise collapse for buf: Cannot allocate memory
> > > +init_hugepages_buf failed for good_buf: Cannot allocate memory
> > >
> > > Note that it was trying to create a 512M page(!) on a VM with 8G of
> > > memory.  Normally one doesn't use large base page size in low memory
> > > environments, but this /is/ fstestsland. 8-)
> > >
> > > From commit 7d8faaf155454f ("mm/madvise: introduce MADV_COLLAPSE sync
> > > hugepage collapse") :
> > >
> > >         ENOMEM  Memory allocation failed or VMA not found
> > >         EBUSY   Memcg charging failed
> > >         EAGAIN  Required resource temporarily unavailable.  Try again
> > >                 might succeed.
> > >         EINVAL  Other error: No PMD found, subpage doesn't have Present
> > >                 bit set, "Special" page no backed by struct page, VMA
> > >                 incorrectly sized, address not page-aligned, ...
> > >
> > > It sounds like ENOMEM/EBUSY/EINVAL should result in
> > > _notrun "Could not generate hugepage" ?  What are your thoughts?
> > >
> >
> > Thanks for running it overnight. This sounds good to me, but will this
> > be robust against false negatives? For example, if it succeeds when
> > we're doing the
> >
> > $here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q 'MADV_COLLAPSE not
> > supported' &&  _notrun "fsx binary does not support MADV_COLLAPSE"
> >
> > check, does that guarantee that the actual run won't error out with
> > ENOMEM/EBUSY/EINVAL?
>
> Nope.  That code only checks that the fsx binary was built with
> MADV_COLLAPSE support, not that any of it actually works.
>
> >                      It seems like there could be the case where it
> > passes the check but then when it actually runs, the system state
> > memory state may have changed and now memcg charging or the memory
> > allocation fails?
>
> Indeed, I think the error I saw is exactly this happening.
>
> In the end I turned the fsx -h invocation into a helper that checks for
> any of those three errors (ENOMEM/BUSY/INVAL) and _notruns the test if
> we can't even get a hugepage at the start.
>
> # Run fsx with -h(ugepage buffers).  If we can't set up a hugepage then skip
> # the test, but if any other error occurs then exit the test.
> _run_hugepage_fsx() {
>         _run_fsx "$@" -h &> $tmp.hugepage_fsx
>         local res=$?
>         if [ $res -eq 103 ]; then
>                 # According to the MADV_COLLAPSE manpage, these three errors
>                 # can happen if the kernel could not collapse a collection of
>                 # pages into a single huge page.
>                 grep -q -E ' for hugebuf: (Cannot allocate memory|Device or resource busy|Resource temporarily unavailable)' $tmp.hugepage_fsx && \
>                         _notrun "Could not set up huge page for test"
>         fi
>         cat $tmp.hugepage_fsx
>         rm -f $tmp.hugepage_fsx
>         test $res -ne 0 && exit 1
>         return 0
> }

Awesome, thank you!

>
> Note that it won't prevent or paper over subsequent MADV_COLLAPSE
> failures once the process is up and running, though as long as nothing
> else in fsx fails or corrupts the file, the collapse failures won't be
> reported.  (As is the case now).
>
> Anyway I'll send patches for both issues.
>
> > allocation fails? EAGAIN seems a bit iffy to me - hopefully this
> > doesn't happen often but if it does, it would likely be a false
> > negative fail for the generic test?
> >
> > Maybe instead of "$here/ltp/fsx -N 0 -h $TEST_DIR 2>&1 | grep -q
> > 'MADV_COLLAPSE not supported' &&  _notrun "fsx binary does not support
> > MADV_COLLAPSE"", we should run the test and then if the output to the
> > .out file is "MADV_COLLAPSE not supported", then we deem that a
> > pass/success? Not sure if this is possible in the fstest
> > infrastructure though for checking against two possible outputs in the
> > .out file. What are your thoughts?
>
> You ... can switch the .out(put) dynamically, but the mechanism to do it
> is underdocumented and quirky; and it makes understanding what the test
> does rather difficult.  See _link_out_file in xfs/614 if you want to
> open that Pandora's box.
>
> --D
>
> >
> >
> > Thanks,
> > Joanne
> >
> > > --D
> > >
> > > > --D
> > > >
> > > > >
> > > > > > --D
> > > > > >
> > > > > > >
> > > > > > > Thanks,
> > > > > > > Joanne
> > > > > > >
> > > > > > > >  psize=`$here/src/feature -s`
> > > > > > > >  bsize=`$here/src/min_dio_alignment $TEST_DIR $TEST_DEV`
> > > > > > > >
> > > > > > >
> > > >
> >
diff mbox series

Patch

diff --git a/ltp/fsx.c b/ltp/fsx.c
index 41933354..3be383c6 100644
--- a/ltp/fsx.c
+++ b/ltp/fsx.c
@@ -190,6 +190,16 @@  int	o_direct;			/* -Z */
 int	aio = 0;
 int	uring = 0;
 int	mark_nr = 0;
+int	hugepages = 0;                  /* -h flag */
+
+/* Stores info needed to periodically collapse hugepages */
+struct hugepages_collapse_info {
+	void *orig_good_buf;
+	long good_buf_size;
+	void *orig_temp_buf;
+	long temp_buf_size;
+};
+struct hugepages_collapse_info hugepages_info;
 
 int page_size;
 int page_mask;
@@ -2471,7 +2481,7 @@  void
 usage(void)
 {
 	fprintf(stdout, "usage: %s",
-		"fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\
+		"fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\
 	   [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\
 	   [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\
 	   [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\
@@ -2483,8 +2493,11 @@  usage(void)
 	-d: debug output for all operations\n\
 	-e: pollute post-eof on size changes (default 0)\n\
 	-f: flush and invalidate cache after I/O\n\
-	-g X: write character X instead of random generated data\n\
-	-i logdev: do integrity testing, logdev is the dm log writes device\n\
+	-g X: write character X instead of random generated data\n"
+#ifdef MADV_COLLAPSE
+"	-h hugepages: use buffers backed by hugepages for reads/writes\n"
+#endif
+"	-i logdev: do integrity testing, logdev is the dm log writes device\n\
 	-j logid: prefix debug log messsages with this id\n\
 	-k: do not truncate existing file and use its size as upper bound on file size\n\
 	-l flen: the upper bound on file size (default 262144)\n\
@@ -2833,11 +2846,41 @@  __test_fallocate(int mode, const char *mode_str)
 #endif
 }
 
+/*
+ * Reclaim may break up hugepages, so do a best-effort collapse every once in
+ * a while.
+ */
+static void
+collapse_hugepages(void)
+{
+#ifdef MADV_COLLAPSE
+	int ret;
+
+	/* re-collapse every 16k fsxops after we start */
+	if (!numops || (numops & ((1U << 14) - 1)))
+		return;
+
+	ret = madvise(hugepages_info.orig_good_buf,
+		      hugepages_info.good_buf_size, MADV_COLLAPSE);
+	if (ret)
+		prt("collapsing hugepages for good_buf failed (numops=%llu): %s\n",
+		     numops, strerror(errno));
+	ret = madvise(hugepages_info.orig_temp_buf,
+		      hugepages_info.temp_buf_size, MADV_COLLAPSE);
+	if (ret)
+		prt("collapsing hugepages for temp_buf failed (numops=%llu): %s\n",
+		     numops, strerror(errno));
+#endif
+}
+
 bool
 keep_running(void)
 {
 	int ret;
 
+	if (hugepages)
+	        collapse_hugepages();
+
 	if (deadline.tv_nsec) {
 		struct timespec now;
 
@@ -2856,6 +2899,103 @@  keep_running(void)
 	return numops-- != 0;
 }
 
+static long
+get_hugepage_size(void)
+{
+	const char str[] = "Hugepagesize:";
+	size_t str_len =  sizeof(str) - 1;
+	unsigned int hugepage_size = 0;
+	char buffer[64];
+	FILE *file;
+
+	file = fopen("/proc/meminfo", "r");
+	if (!file) {
+		prterr("get_hugepage_size: fopen /proc/meminfo");
+		return -1;
+	}
+	while (fgets(buffer, sizeof(buffer), file)) {
+		if (strncmp(buffer, str, str_len) == 0) {
+			sscanf(buffer + str_len, "%u", &hugepage_size);
+			break;
+		}
+	}
+	fclose(file);
+	if (!hugepage_size) {
+		prterr("get_hugepage_size: failed to find "
+			"hugepage size in /proc/meminfo\n");
+		return -1;
+	}
+
+	/* convert from KiB to bytes */
+	return hugepage_size << 10;
+}
+
+static void *
+init_hugepages_buf(unsigned len, int hugepage_size, int alignment, long *buf_size)
+{
+	void *buf = NULL;
+#ifdef MADV_COLLAPSE
+	int ret;
+	long size = roundup(len, hugepage_size) + alignment;
+
+	ret = posix_memalign(&buf, hugepage_size, size);
+	if (ret) {
+		prterr("posix_memalign for buf");
+		return NULL;
+	}
+	memset(buf, '\0', size);
+	ret = madvise(buf, size, MADV_COLLAPSE);
+	if (ret) {
+		prterr("madvise collapse for buf");
+		free(buf);
+		return NULL;
+	}
+
+	*buf_size = size;
+#endif
+	return buf;
+}
+
+static void
+init_buffers(void)
+{
+	int i;
+
+	original_buf = (char *) malloc(maxfilelen);
+	for (i = 0; i < maxfilelen; i++)
+		original_buf[i] = random() % 256;
+	if (hugepages) {
+		long hugepage_size = get_hugepage_size();
+		if (hugepage_size == -1) {
+			prterr("get_hugepage_size()");
+			exit(102);
+		}
+		good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy,
+					      &hugepages_info.good_buf_size);
+		if (!good_buf) {
+			prterr("init_hugepages_buf failed for good_buf");
+			exit(103);
+		}
+		hugepages_info.orig_good_buf = good_buf;
+
+		temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy,
+					      &hugepages_info.temp_buf_size);
+		if (!temp_buf) {
+			prterr("init_hugepages_buf failed for temp_buf");
+			exit(103);
+		}
+		hugepages_info.orig_temp_buf = temp_buf;
+	} else {
+		unsigned long good_buf_len = maxfilelen + writebdy;
+		unsigned long temp_buf_len = maxoplen + readbdy;
+
+		good_buf = calloc(1, good_buf_len);
+		temp_buf = calloc(1, temp_buf_len);
+	}
+	good_buf = round_ptr_up(good_buf, writebdy, 0);
+	temp_buf = round_ptr_up(temp_buf, readbdy, 0);
+}
+
 static struct option longopts[] = {
 	{"replay-ops", required_argument, 0, 256},
 	{"record-ops", optional_argument, 0, 255},
@@ -2883,7 +3023,7 @@  main(int argc, char **argv)
 	setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
 
 	while ((ch = getopt_long(argc, argv,
-				 "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
+				 "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
 				 longopts, NULL)) != EOF)
 		switch (ch) {
 		case 'b':
@@ -2916,6 +3056,14 @@  main(int argc, char **argv)
 		case 'g':
 			filldata = *optarg;
 			break;
+		case 'h':
+#ifndef MADV_COLLAPSE
+			fprintf(stderr, "MADV_COLLAPSE not supported. "
+				"Can't support -h\n");
+			exit(86);
+#endif
+			hugepages = 1;
+			break;
 		case 'i':
 			integrity = 1;
 			logdev = strdup(optarg);
@@ -3229,15 +3377,7 @@  main(int argc, char **argv)
 			exit(95);
 		}
 	}
-	original_buf = (char *) malloc(maxfilelen);
-	for (i = 0; i < maxfilelen; i++)
-		original_buf[i] = random() % 256;
-	good_buf = (char *) malloc(maxfilelen + writebdy);
-	good_buf = round_ptr_up(good_buf, writebdy, 0);
-	memset(good_buf, '\0', maxfilelen);
-	temp_buf = (char *) malloc(maxoplen + readbdy);
-	temp_buf = round_ptr_up(temp_buf, readbdy, 0);
-	memset(temp_buf, '\0', maxoplen);
+	init_buffers();
 	if (lite) {	/* zero entire existing file */
 		ssize_t written;