diff mbox series

[v3,1/2] fsx: support reads/writes from buffers backed by hugepages

Message ID 20250115183107.3124743-2-joannelkoong@gmail.com (mailing list archive)
State New
Headers show
Series fstests: test reads/writes from hugepages-backed buffers | expand

Commit Message

Joanne Koong Jan. 15, 2025, 6:31 p.m. UTC
Add support for reads/writes from buffers backed by hugepages.
This can be enabled through the '-h' flag. This flag should only be used
on systems where THP capabilities are enabled.

This is motivated by a recent bug that was due to faulty handling of
userspace buffers backed by hugepages. This patch is a mitigation
against problems like this in the future.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 ltp/fsx.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 108 insertions(+), 11 deletions(-)

Comments

Darrick J. Wong Jan. 15, 2025, 9:37 p.m. UTC | #1
On Wed, Jan 15, 2025 at 10:31:06AM -0800, Joanne Koong wrote:
> Add support for reads/writes from buffers backed by hugepages.
> This can be enabled through the '-h' flag. This flag should only be used
> on systems where THP capabilities are enabled.
> 
> This is motivated by a recent bug that was due to faulty handling of
> userspace buffers backed by hugepages. This patch is a mitigation
> against problems like this in the future.
> 
> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> Reviewed-by: Brian Foster <bfoster@redhat.com>
> ---
>  ltp/fsx.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 108 insertions(+), 11 deletions(-)
> 
> diff --git a/ltp/fsx.c b/ltp/fsx.c
> index 41933354..8d3a2e2c 100644
> --- a/ltp/fsx.c
> +++ b/ltp/fsx.c
> @@ -190,6 +190,7 @@ int	o_direct;			/* -Z */
>  int	aio = 0;
>  int	uring = 0;
>  int	mark_nr = 0;
> +int	hugepages = 0;                  /* -h flag */
>  
>  int page_size;
>  int page_mask;
> @@ -2471,7 +2472,7 @@ void
>  usage(void)
>  {
>  	fprintf(stdout, "usage: %s",
> -		"fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\
> +		"fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\
>  	   [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\
>  	   [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\
>  	   [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\
> @@ -2484,6 +2485,7 @@ usage(void)
>  	-e: pollute post-eof on size changes (default 0)\n\
>  	-f: flush and invalidate cache after I/O\n\
>  	-g X: write character X instead of random generated data\n\
> +	-h hugepages: use buffers backed by hugepages for reads/writes\n\

If this requires MADV_COLLAPSE, then perhaps the help text shouldn't
describe the switch if the support wasn't compiled in?

e.g.

	-g X: write character X instead of random generated data\n"
#ifdef MADV_COLLAPSE
"	-h hugepages: use buffers backed by hugepages for reads/writes\n"
#endif
"	-i logdev: do integrity testing, logdev is the dm log writes device\n\

(assuming I got the preprocessor and string construction goo right; I
might be a few cards short of a deck due to zombie attack earlier)

>  	-i logdev: do integrity testing, logdev is the dm log writes device\n\
>  	-j logid: prefix debug log messsages with this id\n\
>  	-k: do not truncate existing file and use its size as upper bound on file size\n\
> @@ -2856,6 +2858,101 @@ keep_running(void)
>  	return numops-- != 0;
>  }
>  
> +static long
> +get_hugepage_size(void)
> +{
> +	const char str[] = "Hugepagesize:";
> +	size_t str_len =  sizeof(str) - 1;
> +	unsigned int hugepage_size = 0;
> +	char buffer[64];
> +	FILE *file;
> +
> +	file = fopen("/proc/meminfo", "r");
> +	if (!file) {
> +		prterr("get_hugepage_size: fopen /proc/meminfo");
> +		return -1;
> +	}
> +	while (fgets(buffer, sizeof(buffer), file)) {
> +		if (strncmp(buffer, str, str_len) == 0) {
> +			sscanf(buffer + str_len, "%u", &hugepage_size);
> +			break;
> +		}
> +	}
> +	fclose(file);
> +	if (!hugepage_size) {
> +		prterr("get_hugepage_size: failed to find "
> +			"hugepage size in /proc/meminfo\n");
> +		return -1;
> +	}
> +
> +	/* convert from KiB to bytes */
> +	return hugepage_size << 10;
> +}
> +
> +#ifdef MADV_COLLAPSE
> +static void *
> +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> +{
> +	void *buf;
> +	long buf_size = roundup(len, hugepage_size) + alignment;
> +
> +	if (posix_memalign(&buf, hugepage_size, buf_size)) {
> +		prterr("posix_memalign for buf");
> +		return NULL;
> +	}
> +	memset(buf, '\0', buf_size);
> +	if (madvise(buf, buf_size, MADV_COLLAPSE)) {

If the fsx runs for a long period of time, will it be necessary to call
MADV_COLLAPSE periodically to ensure that reclaim doesn't break up the
hugepage?

> +		prterr("madvise collapse for buf");
> +		free(buf);
> +		return NULL;
> +	}
> +
> +	return buf;
> +}
> +#else
> +static void *
> +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> +{
> +	return NULL;
> +}
> +#endif
> +
> +static void
> +init_buffers(void)
> +{
> +	int i;
> +
> +	original_buf = (char *) malloc(maxfilelen);
> +	for (i = 0; i < maxfilelen; i++)
> +		original_buf[i] = random() % 256;
> +	if (hugepages) {
> +		long hugepage_size = get_hugepage_size();
> +		if (hugepage_size == -1) {
> +			prterr("get_hugepage_size()");
> +			exit(102);
> +		}
> +		good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy);
> +		if (!good_buf) {
> +			prterr("init_hugepages_buf failed for good_buf");
> +			exit(103);
> +		}
> +
> +		temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy);
> +		if (!temp_buf) {
> +			prterr("init_hugepages_buf failed for temp_buf");
> +			exit(103);
> +		}
> +	} else {
> +		unsigned long good_buf_len = maxfilelen + writebdy;
> +		unsigned long temp_buf_len = maxoplen + readbdy;
> +
> +		good_buf = calloc(1, good_buf_len);
> +		temp_buf = calloc(1, temp_buf_len);
> +	}
> +	good_buf = round_ptr_up(good_buf, writebdy, 0);
> +	temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> +}
> +
>  static struct option longopts[] = {
>  	{"replay-ops", required_argument, 0, 256},
>  	{"record-ops", optional_argument, 0, 255},
> @@ -2883,7 +2980,7 @@ main(int argc, char **argv)
>  	setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
>  
>  	while ((ch = getopt_long(argc, argv,
> -				 "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> +				 "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
>  				 longopts, NULL)) != EOF)
>  		switch (ch) {
>  		case 'b':
> @@ -2916,6 +3013,14 @@ main(int argc, char **argv)
>  		case 'g':
>  			filldata = *optarg;
>  			break;
> +		case 'h':
> +			#ifndef MADV_COLLAPSE

Preprocessor directives should start at column 0, like most of the rest
of fstests.

--D

> +				fprintf(stderr, "MADV_COLLAPSE not supported. "
> +					"Can't support -h\n");
> +				exit(86);
> +			#endif
> +			hugepages = 1;
> +			break;
>  		case 'i':
>  			integrity = 1;
>  			logdev = strdup(optarg);
> @@ -3229,15 +3334,7 @@ main(int argc, char **argv)
>  			exit(95);
>  		}
>  	}
> -	original_buf = (char *) malloc(maxfilelen);
> -	for (i = 0; i < maxfilelen; i++)
> -		original_buf[i] = random() % 256;
> -	good_buf = (char *) malloc(maxfilelen + writebdy);
> -	good_buf = round_ptr_up(good_buf, writebdy, 0);
> -	memset(good_buf, '\0', maxfilelen);
> -	temp_buf = (char *) malloc(maxoplen + readbdy);
> -	temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> -	memset(temp_buf, '\0', maxoplen);
> +	init_buffers();
>  	if (lite) {	/* zero entire existing file */
>  		ssize_t written;
>  
> -- 
> 2.47.1
> 
>
Joanne Koong Jan. 16, 2025, 12:47 a.m. UTC | #2
On Wed, Jan 15, 2025 at 1:37 PM Darrick J. Wong <djwong@kernel.org> wrote:
>
> On Wed, Jan 15, 2025 at 10:31:06AM -0800, Joanne Koong wrote:
> > Add support for reads/writes from buffers backed by hugepages.
> > This can be enabled through the '-h' flag. This flag should only be used
> > on systems where THP capabilities are enabled.
> >
> > This is motivated by a recent bug that was due to faulty handling of
> > userspace buffers backed by hugepages. This patch is a mitigation
> > against problems like this in the future.
> >
> > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > ---
> >  ltp/fsx.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> >  1 file changed, 108 insertions(+), 11 deletions(-)
> >
> > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > index 41933354..8d3a2e2c 100644
> > --- a/ltp/fsx.c
> > +++ b/ltp/fsx.c
> > @@ -190,6 +190,7 @@ int       o_direct;                       /* -Z */
> >  int  aio = 0;
> >  int  uring = 0;
> >  int  mark_nr = 0;
> > +int  hugepages = 0;                  /* -h flag */
> >
> >  int page_size;
> >  int page_mask;
> > @@ -2471,7 +2472,7 @@ void
> >  usage(void)
> >  {
> >       fprintf(stdout, "usage: %s",
> > -             "fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\
> > +             "fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\
> >          [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\
> >          [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\
> >          [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\
> > @@ -2484,6 +2485,7 @@ usage(void)
> >       -e: pollute post-eof on size changes (default 0)\n\
> >       -f: flush and invalidate cache after I/O\n\
> >       -g X: write character X instead of random generated data\n\
> > +     -h hugepages: use buffers backed by hugepages for reads/writes\n\
>
> If this requires MADV_COLLAPSE, then perhaps the help text shouldn't
> describe the switch if the support wasn't compiled in?
>
> e.g.
>
>         -g X: write character X instead of random generated data\n"
> #ifdef MADV_COLLAPSE
> "       -h hugepages: use buffers backed by hugepages for reads/writes\n"
> #endif
> "       -i logdev: do integrity testing, logdev is the dm log writes device\n\
>
> (assuming I got the preprocessor and string construction goo right; I
> might be a few cards short of a deck due to zombie attack earlier)

Sounds great, I'll #ifdef out the help text -h line. Hope you feel better.
>
> >       -i logdev: do integrity testing, logdev is the dm log writes device\n\
> >       -j logid: prefix debug log messsages with this id\n\
> >       -k: do not truncate existing file and use its size as upper bound on file size\n\
[...]
> > +}
> > +
> > +#ifdef MADV_COLLAPSE
> > +static void *
> > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> > +{
> > +     void *buf;
> > +     long buf_size = roundup(len, hugepage_size) + alignment;
> > +
> > +     if (posix_memalign(&buf, hugepage_size, buf_size)) {
> > +             prterr("posix_memalign for buf");
> > +             return NULL;
> > +     }
> > +     memset(buf, '\0', buf_size);
> > +     if (madvise(buf, buf_size, MADV_COLLAPSE)) {
>
> If the fsx runs for a long period of time, will it be necessary to call
> MADV_COLLAPSE periodically to ensure that reclaim doesn't break up the
> hugepage?
>

imo, I don't think so. My understanding is that this would be a rare
edge case that happens when the system is constrained on memory, in
which case subsequent calls to MADV_COLLAPSE would most likely fail
anyways.


Thanks,
Joanne

> > +             prterr("madvise collapse for buf");
> > +             free(buf);
> > +             return NULL;
> > +     }
> > +
> > +     return buf;
> > +}
> > +#else
> > +static void *
> > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> > +{
> > +     return NULL;
> > +}
> > +#endif
> > +
> > +static void
> > +init_buffers(void)
> > +{
> > +     int i;
> > +
> > +     original_buf = (char *) malloc(maxfilelen);
> > +     for (i = 0; i < maxfilelen; i++)
> > +             original_buf[i] = random() % 256;
> > +     if (hugepages) {
> > +             long hugepage_size = get_hugepage_size();
> > +             if (hugepage_size == -1) {
> > +                     prterr("get_hugepage_size()");
> > +                     exit(102);
> > +             }
> > +             good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy);
> > +             if (!good_buf) {
> > +                     prterr("init_hugepages_buf failed for good_buf");
> > +                     exit(103);
> > +             }
> > +
> > +             temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy);
> > +             if (!temp_buf) {
> > +                     prterr("init_hugepages_buf failed for temp_buf");
> > +                     exit(103);
> > +             }
> > +     } else {
> > +             unsigned long good_buf_len = maxfilelen + writebdy;
> > +             unsigned long temp_buf_len = maxoplen + readbdy;
> > +
> > +             good_buf = calloc(1, good_buf_len);
> > +             temp_buf = calloc(1, temp_buf_len);
> > +     }
> > +     good_buf = round_ptr_up(good_buf, writebdy, 0);
> > +     temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> > +}
> > +
> >  static struct option longopts[] = {
> >       {"replay-ops", required_argument, 0, 256},
> >       {"record-ops", optional_argument, 0, 255},
> > @@ -2883,7 +2980,7 @@ main(int argc, char **argv)
> >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> >
> >       while ((ch = getopt_long(argc, argv,
> > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> >                                longopts, NULL)) != EOF)
> >               switch (ch) {
> >               case 'b':
> > @@ -2916,6 +3013,14 @@ main(int argc, char **argv)
> >               case 'g':
> >                       filldata = *optarg;
> >                       break;
> > +             case 'h':
> > +                     #ifndef MADV_COLLAPSE
>
> Preprocessor directives should start at column 0, like most of the rest
> of fstests.
>
> --D
>
Darrick J. Wong Jan. 16, 2025, 12:59 a.m. UTC | #3
On Wed, Jan 15, 2025 at 04:47:30PM -0800, Joanne Koong wrote:
> On Wed, Jan 15, 2025 at 1:37 PM Darrick J. Wong <djwong@kernel.org> wrote:
> >
> > On Wed, Jan 15, 2025 at 10:31:06AM -0800, Joanne Koong wrote:
> > > Add support for reads/writes from buffers backed by hugepages.
> > > This can be enabled through the '-h' flag. This flag should only be used
> > > on systems where THP capabilities are enabled.
> > >
> > > This is motivated by a recent bug that was due to faulty handling of
> > > userspace buffers backed by hugepages. This patch is a mitigation
> > > against problems like this in the future.
> > >
> > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > ---
> > >  ltp/fsx.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > >  1 file changed, 108 insertions(+), 11 deletions(-)
> > >
> > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > index 41933354..8d3a2e2c 100644
> > > --- a/ltp/fsx.c
> > > +++ b/ltp/fsx.c
> > > @@ -190,6 +190,7 @@ int       o_direct;                       /* -Z */
> > >  int  aio = 0;
> > >  int  uring = 0;
> > >  int  mark_nr = 0;
> > > +int  hugepages = 0;                  /* -h flag */
> > >
> > >  int page_size;
> > >  int page_mask;
> > > @@ -2471,7 +2472,7 @@ void
> > >  usage(void)
> > >  {
> > >       fprintf(stdout, "usage: %s",
> > > -             "fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\
> > > +             "fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\
> > >          [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\
> > >          [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\
> > >          [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\
> > > @@ -2484,6 +2485,7 @@ usage(void)
> > >       -e: pollute post-eof on size changes (default 0)\n\
> > >       -f: flush and invalidate cache after I/O\n\
> > >       -g X: write character X instead of random generated data\n\
> > > +     -h hugepages: use buffers backed by hugepages for reads/writes\n\
> >
> > If this requires MADV_COLLAPSE, then perhaps the help text shouldn't
> > describe the switch if the support wasn't compiled in?
> >
> > e.g.
> >
> >         -g X: write character X instead of random generated data\n"
> > #ifdef MADV_COLLAPSE
> > "       -h hugepages: use buffers backed by hugepages for reads/writes\n"
> > #endif
> > "       -i logdev: do integrity testing, logdev is the dm log writes device\n\
> >
> > (assuming I got the preprocessor and string construction goo right; I
> > might be a few cards short of a deck due to zombie attack earlier)
> 
> Sounds great, I'll #ifdef out the help text -h line. Hope you feel better.
> >
> > >       -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > >       -j logid: prefix debug log messsages with this id\n\
> > >       -k: do not truncate existing file and use its size as upper bound on file size\n\
> [...]
> > > +}
> > > +
> > > +#ifdef MADV_COLLAPSE
> > > +static void *
> > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> > > +{
> > > +     void *buf;
> > > +     long buf_size = roundup(len, hugepage_size) + alignment;
> > > +
> > > +     if (posix_memalign(&buf, hugepage_size, buf_size)) {
> > > +             prterr("posix_memalign for buf");
> > > +             return NULL;
> > > +     }
> > > +     memset(buf, '\0', buf_size);
> > > +     if (madvise(buf, buf_size, MADV_COLLAPSE)) {
> >
> > If the fsx runs for a long period of time, will it be necessary to call
> > MADV_COLLAPSE periodically to ensure that reclaim doesn't break up the
> > hugepage?
> >
> 
> imo, I don't think so. My understanding is that this would be a rare
> edge case that happens when the system is constrained on memory, in
> which case subsequent calls to MADV_COLLAPSE would most likely fail
> anyways.

Hrmmm... well I /do/ like to run memory constrained VMs to prod reclaim
into stressing the filesystem more.  But I guess there's no good way for
fsx to know that something happened to it.  Unless there's some even
goofier way to force a hugepage, like shmem/hugetlbfs (ugh!) :)

Will have to ponder hugepage renewal -- maybe we should madvise every
few thousand fsxops just to be careful?

--D

> 
> Thanks,
> Joanne
> 
> > > +             prterr("madvise collapse for buf");
> > > +             free(buf);
> > > +             return NULL;
> > > +     }
> > > +
> > > +     return buf;
> > > +}
> > > +#else
> > > +static void *
> > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> > > +{
> > > +     return NULL;
> > > +}
> > > +#endif
> > > +
> > > +static void
> > > +init_buffers(void)
> > > +{
> > > +     int i;
> > > +
> > > +     original_buf = (char *) malloc(maxfilelen);
> > > +     for (i = 0; i < maxfilelen; i++)
> > > +             original_buf[i] = random() % 256;
> > > +     if (hugepages) {
> > > +             long hugepage_size = get_hugepage_size();
> > > +             if (hugepage_size == -1) {
> > > +                     prterr("get_hugepage_size()");
> > > +                     exit(102);
> > > +             }
> > > +             good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy);
> > > +             if (!good_buf) {
> > > +                     prterr("init_hugepages_buf failed for good_buf");
> > > +                     exit(103);
> > > +             }
> > > +
> > > +             temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy);
> > > +             if (!temp_buf) {
> > > +                     prterr("init_hugepages_buf failed for temp_buf");
> > > +                     exit(103);
> > > +             }
> > > +     } else {
> > > +             unsigned long good_buf_len = maxfilelen + writebdy;
> > > +             unsigned long temp_buf_len = maxoplen + readbdy;
> > > +
> > > +             good_buf = calloc(1, good_buf_len);
> > > +             temp_buf = calloc(1, temp_buf_len);
> > > +     }
> > > +     good_buf = round_ptr_up(good_buf, writebdy, 0);
> > > +     temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> > > +}
> > > +
> > >  static struct option longopts[] = {
> > >       {"replay-ops", required_argument, 0, 256},
> > >       {"record-ops", optional_argument, 0, 255},
> > > @@ -2883,7 +2980,7 @@ main(int argc, char **argv)
> > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > >
> > >       while ((ch = getopt_long(argc, argv,
> > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > >                                longopts, NULL)) != EOF)
> > >               switch (ch) {
> > >               case 'b':
> > > @@ -2916,6 +3013,14 @@ main(int argc, char **argv)
> > >               case 'g':
> > >                       filldata = *optarg;
> > >                       break;
> > > +             case 'h':
> > > +                     #ifndef MADV_COLLAPSE
> >
> > Preprocessor directives should start at column 0, like most of the rest
> > of fstests.
> >
> > --D
> >
Brian Foster Jan. 16, 2025, 12:53 p.m. UTC | #4
On Wed, Jan 15, 2025 at 04:59:19PM -0800, Darrick J. Wong wrote:
> On Wed, Jan 15, 2025 at 04:47:30PM -0800, Joanne Koong wrote:
> > On Wed, Jan 15, 2025 at 1:37 PM Darrick J. Wong <djwong@kernel.org> wrote:
> > >
> > > On Wed, Jan 15, 2025 at 10:31:06AM -0800, Joanne Koong wrote:
> > > > Add support for reads/writes from buffers backed by hugepages.
> > > > This can be enabled through the '-h' flag. This flag should only be used
> > > > on systems where THP capabilities are enabled.
> > > >
> > > > This is motivated by a recent bug that was due to faulty handling of
> > > > userspace buffers backed by hugepages. This patch is a mitigation
> > > > against problems like this in the future.
> > > >
> > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > > ---
> > > >  ltp/fsx.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > > >  1 file changed, 108 insertions(+), 11 deletions(-)
> > > >
> > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > index 41933354..8d3a2e2c 100644
> > > > --- a/ltp/fsx.c
> > > > +++ b/ltp/fsx.c
> > > > @@ -190,6 +190,7 @@ int       o_direct;                       /* -Z */
> > > >  int  aio = 0;
> > > >  int  uring = 0;
> > > >  int  mark_nr = 0;
> > > > +int  hugepages = 0;                  /* -h flag */
> > > >
> > > >  int page_size;
> > > >  int page_mask;
> > > > @@ -2471,7 +2472,7 @@ void
> > > >  usage(void)
> > > >  {
> > > >       fprintf(stdout, "usage: %s",
> > > > -             "fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\
> > > > +             "fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\
> > > >          [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\
> > > >          [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\
> > > >          [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\
> > > > @@ -2484,6 +2485,7 @@ usage(void)
> > > >       -e: pollute post-eof on size changes (default 0)\n\
> > > >       -f: flush and invalidate cache after I/O\n\
> > > >       -g X: write character X instead of random generated data\n\
> > > > +     -h hugepages: use buffers backed by hugepages for reads/writes\n\
> > >
> > > If this requires MADV_COLLAPSE, then perhaps the help text shouldn't
> > > describe the switch if the support wasn't compiled in?
> > >
> > > e.g.
> > >
> > >         -g X: write character X instead of random generated data\n"
> > > #ifdef MADV_COLLAPSE
> > > "       -h hugepages: use buffers backed by hugepages for reads/writes\n"
> > > #endif
> > > "       -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > >
> > > (assuming I got the preprocessor and string construction goo right; I
> > > might be a few cards short of a deck due to zombie attack earlier)
> > 
> > Sounds great, I'll #ifdef out the help text -h line. Hope you feel better.
> > >
> > > >       -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > > >       -j logid: prefix debug log messsages with this id\n\
> > > >       -k: do not truncate existing file and use its size as upper bound on file size\n\
> > [...]
> > > > +}
> > > > +
> > > > +#ifdef MADV_COLLAPSE
> > > > +static void *
> > > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> > > > +{
> > > > +     void *buf;
> > > > +     long buf_size = roundup(len, hugepage_size) + alignment;
> > > > +
> > > > +     if (posix_memalign(&buf, hugepage_size, buf_size)) {
> > > > +             prterr("posix_memalign for buf");
> > > > +             return NULL;
> > > > +     }
> > > > +     memset(buf, '\0', buf_size);
> > > > +     if (madvise(buf, buf_size, MADV_COLLAPSE)) {
> > >
> > > If the fsx runs for a long period of time, will it be necessary to call
> > > MADV_COLLAPSE periodically to ensure that reclaim doesn't break up the
> > > hugepage?
> > >
> > 
> > imo, I don't think so. My understanding is that this would be a rare
> > edge case that happens when the system is constrained on memory, in
> > which case subsequent calls to MADV_COLLAPSE would most likely fail
> > anyways.
> 
> Hrmmm... well I /do/ like to run memory constrained VMs to prod reclaim
> into stressing the filesystem more.  But I guess there's no good way for
> fsx to know that something happened to it.  Unless there's some even
> goofier way to force a hugepage, like shmem/hugetlbfs (ugh!) :)
> 
> Will have to ponder hugepage renewasl -- maybe we should madvise every
> few thousand fsxops just to be careful?
> 

I wonder.. is there test value in doing collapses to the target file as
well, either as a standalone map/madvise command or a random thing
hitched onto preexisting commands? If so, I could see how something like
that could potentially lift the current init time only approach into
something that occurs with frequency, which then could at the same time
(again maybe randomly) reinvoke for internal buffers as well.

All that said, this is new functionality and IIUC provides functional
test coverage for a valid issue. IMO, it would be nice to get this
merged as a baseline feature and explore these sort of enhancements as
followon work. Just my .02.

Brian

> --D
> 
> > 
> > Thanks,
> > Joanne
> > 
> > > > +             prterr("madvise collapse for buf");
> > > > +             free(buf);
> > > > +             return NULL;
> > > > +     }
> > > > +
> > > > +     return buf;
> > > > +}
> > > > +#else
> > > > +static void *
> > > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> > > > +{
> > > > +     return NULL;
> > > > +}
> > > > +#endif
> > > > +
> > > > +static void
> > > > +init_buffers(void)
> > > > +{
> > > > +     int i;
> > > > +
> > > > +     original_buf = (char *) malloc(maxfilelen);
> > > > +     for (i = 0; i < maxfilelen; i++)
> > > > +             original_buf[i] = random() % 256;
> > > > +     if (hugepages) {
> > > > +             long hugepage_size = get_hugepage_size();
> > > > +             if (hugepage_size == -1) {
> > > > +                     prterr("get_hugepage_size()");
> > > > +                     exit(102);
> > > > +             }
> > > > +             good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy);
> > > > +             if (!good_buf) {
> > > > +                     prterr("init_hugepages_buf failed for good_buf");
> > > > +                     exit(103);
> > > > +             }
> > > > +
> > > > +             temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy);
> > > > +             if (!temp_buf) {
> > > > +                     prterr("init_hugepages_buf failed for temp_buf");
> > > > +                     exit(103);
> > > > +             }
> > > > +     } else {
> > > > +             unsigned long good_buf_len = maxfilelen + writebdy;
> > > > +             unsigned long temp_buf_len = maxoplen + readbdy;
> > > > +
> > > > +             good_buf = calloc(1, good_buf_len);
> > > > +             temp_buf = calloc(1, temp_buf_len);
> > > > +     }
> > > > +     good_buf = round_ptr_up(good_buf, writebdy, 0);
> > > > +     temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> > > > +}
> > > > +
> > > >  static struct option longopts[] = {
> > > >       {"replay-ops", required_argument, 0, 256},
> > > >       {"record-ops", optional_argument, 0, 255},
> > > > @@ -2883,7 +2980,7 @@ main(int argc, char **argv)
> > > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > > >
> > > >       while ((ch = getopt_long(argc, argv,
> > > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > >                                longopts, NULL)) != EOF)
> > > >               switch (ch) {
> > > >               case 'b':
> > > > @@ -2916,6 +3013,14 @@ main(int argc, char **argv)
> > > >               case 'g':
> > > >                       filldata = *optarg;
> > > >                       break;
> > > > +             case 'h':
> > > > +                     #ifndef MADV_COLLAPSE
> > >
> > > Preprocessor directives should start at column 0, like most of the rest
> > > of fstests.
> > >
> > > --D
> > >
>
Joanne Koong Jan. 17, 2025, 1:03 a.m. UTC | #5
On Wed, Jan 15, 2025 at 4:59 PM Darrick J. Wong <djwong@kernel.org> wrote:
>
> On Wed, Jan 15, 2025 at 04:47:30PM -0800, Joanne Koong wrote:
> > On Wed, Jan 15, 2025 at 1:37 PM Darrick J. Wong <djwong@kernel.org> wrote:
> > >
> > > On Wed, Jan 15, 2025 at 10:31:06AM -0800, Joanne Koong wrote:
> > > > Add support for reads/writes from buffers backed by hugepages.
> > > > This can be enabled through the '-h' flag. This flag should only be used
> > > > on systems where THP capabilities are enabled.
> > > >
> > > > This is motivated by a recent bug that was due to faulty handling of
> > > > userspace buffers backed by hugepages. This patch is a mitigation
> > > > against problems like this in the future.
> > > >
> > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > > ---
> > > >  ltp/fsx.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > > >  1 file changed, 108 insertions(+), 11 deletions(-)
> > > >
> > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > index 41933354..8d3a2e2c 100644
> > > > --- a/ltp/fsx.c
> > > > +++ b/ltp/fsx.c
> > > > @@ -190,6 +190,7 @@ int       o_direct;                       /* -Z */
> > > >  int  aio = 0;
> > > >  int  uring = 0;
> > > >  int  mark_nr = 0;
> > > > +int  hugepages = 0;                  /* -h flag */
> > > >
> > > >  int page_size;
> > > >  int page_mask;
> > > > @@ -2471,7 +2472,7 @@ void
> > > >  usage(void)
> > > >  {
> > > >       fprintf(stdout, "usage: %s",
> > > > -             "fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\
> > > > +             "fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\
> > > >          [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\
> > > >          [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\
> > > >          [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\
> > > > @@ -2484,6 +2485,7 @@ usage(void)
> > > >       -e: pollute post-eof on size changes (default 0)\n\
> > > >       -f: flush and invalidate cache after I/O\n\
> > > >       -g X: write character X instead of random generated data\n\
> > > > +     -h hugepages: use buffers backed by hugepages for reads/writes\n\
> > >
> > > If this requires MADV_COLLAPSE, then perhaps the help text shouldn't
> > > describe the switch if the support wasn't compiled in?
> > >
> > > e.g.
> > >
> > >         -g X: write character X instead of random generated data\n"
> > > #ifdef MADV_COLLAPSE
> > > "       -h hugepages: use buffers backed by hugepages for reads/writes\n"
> > > #endif
> > > "       -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > >
> > > (assuming I got the preprocessor and string construction goo right; I
> > > might be a few cards short of a deck due to zombie attack earlier)
> >
> > Sounds great, I'll #ifdef out the help text -h line. Hope you feel better.
> > >
> > > >       -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > > >       -j logid: prefix debug log messsages with this id\n\
> > > >       -k: do not truncate existing file and use its size as upper bound on file size\n\
> > [...]
> > > > +}
> > > > +
> > > > +#ifdef MADV_COLLAPSE
> > > > +static void *
> > > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> > > > +{
> > > > +     void *buf;
> > > > +     long buf_size = roundup(len, hugepage_size) + alignment;
> > > > +
> > > > +     if (posix_memalign(&buf, hugepage_size, buf_size)) {
> > > > +             prterr("posix_memalign for buf");
> > > > +             return NULL;
> > > > +     }
> > > > +     memset(buf, '\0', buf_size);
> > > > +     if (madvise(buf, buf_size, MADV_COLLAPSE)) {
> > >
> > > If the fsx runs for a long period of time, will it be necessary to call
> > > MADV_COLLAPSE periodically to ensure that reclaim doesn't break up the
> > > hugepage?
> > >
> >
> > imo, I don't think so. My understanding is that this would be a rare
> > edge case that happens when the system is constrained on memory, in
> > which case subsequent calls to MADV_COLLAPSE would most likely fail
> > anyways.
>
> Hrmmm... well I /do/ like to run memory constrained VMs to prod reclaim
> into stressing the filesystem more.  But I guess there's no good way for
> fsx to know that something happened to it.  Unless there's some even
> goofier way to force a hugepage, like shmem/hugetlbfs (ugh!) :)

I can't think of a better way to force a hugepage either. I believe
shmem and hugetlbfs would both require root privileges to do so, and
if I'm not mistaken, shmem hugepages are still subject to being broken
up by reclaim.

>
> Will have to ponder hugepage renewasl -- maybe we should madvise every
> few thousand fsxops just to be careful?

I can add this in, but on memory constrained VMs, would this be
effective? To me, it seems like in the majority of cases, subsequent
attempts at collapsing the broken pages back into a hugepage would
fail due to memory still being constrained. In which case, I guess
we'd exit the test altogether? It kind of seems to me like if the user
wants to test out hugepages functionality of their filesystem, then
the onus is on them to run the test in an environment that can
adequately and consistently support hugepages.

Thanks,
Joanne

>
> --D
>
> >
> > Thanks,
> > Joanne
> >
> > > > +             prterr("madvise collapse for buf");
> > > > +             free(buf);
> > > > +             return NULL;
> > > > +     }
> > > > +
> > > > +     return buf;
> > > > +}
> > > > +#else
> > > > +static void *
> > > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> > > > +{
> > > > +     return NULL;
> > > > +}
> > > > +#endif
> > > > +
> > > > +static void
> > > > +init_buffers(void)
> > > > +{
> > > > +     int i;
> > > > +
> > > > +     original_buf = (char *) malloc(maxfilelen);
> > > > +     for (i = 0; i < maxfilelen; i++)
> > > > +             original_buf[i] = random() % 256;
> > > > +     if (hugepages) {
> > > > +             long hugepage_size = get_hugepage_size();
> > > > +             if (hugepage_size == -1) {
> > > > +                     prterr("get_hugepage_size()");
> > > > +                     exit(102);
> > > > +             }
> > > > +             good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy);
> > > > +             if (!good_buf) {
> > > > +                     prterr("init_hugepages_buf failed for good_buf");
> > > > +                     exit(103);
> > > > +             }
> > > > +
> > > > +             temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy);
> > > > +             if (!temp_buf) {
> > > > +                     prterr("init_hugepages_buf failed for temp_buf");
> > > > +                     exit(103);
> > > > +             }
> > > > +     } else {
> > > > +             unsigned long good_buf_len = maxfilelen + writebdy;
> > > > +             unsigned long temp_buf_len = maxoplen + readbdy;
> > > > +
> > > > +             good_buf = calloc(1, good_buf_len);
> > > > +             temp_buf = calloc(1, temp_buf_len);
> > > > +     }
> > > > +     good_buf = round_ptr_up(good_buf, writebdy, 0);
> > > > +     temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> > > > +}
> > > > +
> > > >  static struct option longopts[] = {
> > > >       {"replay-ops", required_argument, 0, 256},
> > > >       {"record-ops", optional_argument, 0, 255},
> > > > @@ -2883,7 +2980,7 @@ main(int argc, char **argv)
> > > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > > >
> > > >       while ((ch = getopt_long(argc, argv,
> > > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > >                                longopts, NULL)) != EOF)
> > > >               switch (ch) {
> > > >               case 'b':
> > > > @@ -2916,6 +3013,14 @@ main(int argc, char **argv)
> > > >               case 'g':
> > > >                       filldata = *optarg;
> > > >                       break;
> > > > +             case 'h':
> > > > +                     #ifndef MADV_COLLAPSE
> > >
> > > Preprocessor directives should start at column 0, like most of the rest
> > > of fstests.
> > >
> > > --D
> > >
Joanne Koong Jan. 17, 2025, 1:26 a.m. UTC | #6
On Thu, Jan 16, 2025 at 4:51 AM Brian Foster <bfoster@redhat.com> wrote:
>
> On Wed, Jan 15, 2025 at 04:59:19PM -0800, Darrick J. Wong wrote:
> > On Wed, Jan 15, 2025 at 04:47:30PM -0800, Joanne Koong wrote:
> > > On Wed, Jan 15, 2025 at 1:37 PM Darrick J. Wong <djwong@kernel.org> wrote:
> > > >
> > > > On Wed, Jan 15, 2025 at 10:31:06AM -0800, Joanne Koong wrote:
> > > > > Add support for reads/writes from buffers backed by hugepages.
> > > > > This can be enabled through the '-h' flag. This flag should only be used
> > > > > on systems where THP capabilities are enabled.
> > > > >
> > > > > This is motivated by a recent bug that was due to faulty handling of
> > > > > userspace buffers backed by hugepages. This patch is a mitigation
> > > > > against problems like this in the future.
> > > > >
> > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > > > ---
> > > > >  ltp/fsx.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > > > >  1 file changed, 108 insertions(+), 11 deletions(-)
> > > > >
> > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > > index 41933354..8d3a2e2c 100644
> > > > > --- a/ltp/fsx.c
> > > > > +++ b/ltp/fsx.c
> > > > > @@ -190,6 +190,7 @@ int       o_direct;                       /* -Z */
> > > > >  int  aio = 0;
> > > > >  int  uring = 0;
> > > > >  int  mark_nr = 0;
> > > > > +int  hugepages = 0;                  /* -h flag */
> > > > >
> > > > >  int page_size;
> > > > >  int page_mask;
> > > > > @@ -2471,7 +2472,7 @@ void
> > > > >  usage(void)
> > > > >  {
> > > > >       fprintf(stdout, "usage: %s",
> > > > > -             "fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\
> > > > > +             "fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\
> > > > >          [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\
> > > > >          [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\
> > > > >          [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\
> > > > > @@ -2484,6 +2485,7 @@ usage(void)
> > > > >       -e: pollute post-eof on size changes (default 0)\n\
> > > > >       -f: flush and invalidate cache after I/O\n\
> > > > >       -g X: write character X instead of random generated data\n\
> > > > > +     -h hugepages: use buffers backed by hugepages for reads/writes\n\
> > > >
> > > > If this requires MADV_COLLAPSE, then perhaps the help text shouldn't
> > > > describe the switch if the support wasn't compiled in?
> > > >
> > > > e.g.
> > > >
> > > >         -g X: write character X instead of random generated data\n"
> > > > #ifdef MADV_COLLAPSE
> > > > "       -h hugepages: use buffers backed by hugepages for reads/writes\n"
> > > > #endif
> > > > "       -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > > >
> > > > (assuming I got the preprocessor and string construction goo right; I
> > > > might be a few cards short of a deck due to zombie attack earlier)
> > >
> > > Sounds great, I'll #ifdef out the help text -h line. Hope you feel better.
> > > >
> > > > >       -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > > > >       -j logid: prefix debug log messsages with this id\n\
> > > > >       -k: do not truncate existing file and use its size as upper bound on file size\n\
> > > [...]
> > > > > +}
> > > > > +
> > > > > +#ifdef MADV_COLLAPSE
> > > > > +static void *
> > > > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> > > > > +{
> > > > > +     void *buf;
> > > > > +     long buf_size = roundup(len, hugepage_size) + alignment;
> > > > > +
> > > > > +     if (posix_memalign(&buf, hugepage_size, buf_size)) {
> > > > > +             prterr("posix_memalign for buf");
> > > > > +             return NULL;
> > > > > +     }
> > > > > +     memset(buf, '\0', buf_size);
> > > > > +     if (madvise(buf, buf_size, MADV_COLLAPSE)) {
> > > >
> > > > If the fsx runs for a long period of time, will it be necessary to call
> > > > MADV_COLLAPSE periodically to ensure that reclaim doesn't break up the
> > > > hugepage?
> > > >
> > >
> > > imo, I don't think so. My understanding is that this would be a rare
> > > edge case that happens when the system is constrained on memory, in
> > > which case subsequent calls to MADV_COLLAPSE would most likely fail
> > > anyways.
> >
> > Hrmmm... well I /do/ like to run memory constrained VMs to prod reclaim
> > into stressing the filesystem more.  But I guess there's no good way for
> > fsx to know that something happened to it.  Unless there's some even
> > goofier way to force a hugepage, like shmem/hugetlbfs (ugh!) :)
> >
> > Will have to ponder hugepage renewasl -- maybe we should madvise every
> > few thousand fsxops just to be careful?
> >
>
> I wonder.. is there test value in doing collapses to the target file as
> well, either as a standalone map/madvise command or a random thing
> hitched onto preexisting commands? If so, I could see how something like
> that could potentially lift the current init time only approach into
> something that occurs with frequency, which then could at the same time
> (again maybe randomly) reinvoke for internal buffers as well.

My understanding is that if a filesystem has support enabled for large
folios, then doing large writes/reads (which I believe is currently
supported in fsx via the -o flag) will already automatically test the
functionality of how the filesystem handles hugepages. I don't think
this would be different from what doing a collapse on the target file
would do.


Thanks,
Joanne

>
> All that said, this is new functionality and IIUC provides functional
> test coverage for a valid issue. IMO, it would be nice to get this
> merged as a baseline feature and explore these sort of enhancements as
> followon work. Just my .02.
>
> Brian
>
> > --D
> >
> > >
> > > Thanks,
> > > Joanne
> > >
> > > > > +             prterr("madvise collapse for buf");
> > > > > +             free(buf);
> > > > > +             return NULL;
> > > > > +     }
> > > > > +
> > > > > +     return buf;
> > > > > +}
> > > > > +#else
> > > > > +static void *
> > > > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> > > > > +{
> > > > > +     return NULL;
> > > > > +}
> > > > > +#endif
> > > > > +
> > > > > +static void
> > > > > +init_buffers(void)
> > > > > +{
> > > > > +     int i;
> > > > > +
> > > > > +     original_buf = (char *) malloc(maxfilelen);
> > > > > +     for (i = 0; i < maxfilelen; i++)
> > > > > +             original_buf[i] = random() % 256;
> > > > > +     if (hugepages) {
> > > > > +             long hugepage_size = get_hugepage_size();
> > > > > +             if (hugepage_size == -1) {
> > > > > +                     prterr("get_hugepage_size()");
> > > > > +                     exit(102);
> > > > > +             }
> > > > > +             good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy);
> > > > > +             if (!good_buf) {
> > > > > +                     prterr("init_hugepages_buf failed for good_buf");
> > > > > +                     exit(103);
> > > > > +             }
> > > > > +
> > > > > +             temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy);
> > > > > +             if (!temp_buf) {
> > > > > +                     prterr("init_hugepages_buf failed for temp_buf");
> > > > > +                     exit(103);
> > > > > +             }
> > > > > +     } else {
> > > > > +             unsigned long good_buf_len = maxfilelen + writebdy;
> > > > > +             unsigned long temp_buf_len = maxoplen + readbdy;
> > > > > +
> > > > > +             good_buf = calloc(1, good_buf_len);
> > > > > +             temp_buf = calloc(1, temp_buf_len);
> > > > > +     }
> > > > > +     good_buf = round_ptr_up(good_buf, writebdy, 0);
> > > > > +     temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> > > > > +}
> > > > > +
> > > > >  static struct option longopts[] = {
> > > > >       {"replay-ops", required_argument, 0, 256},
> > > > >       {"record-ops", optional_argument, 0, 255},
> > > > > @@ -2883,7 +2980,7 @@ main(int argc, char **argv)
> > > > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > > > >
> > > > >       while ((ch = getopt_long(argc, argv,
> > > > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > >                                longopts, NULL)) != EOF)
> > > > >               switch (ch) {
> > > > >               case 'b':
> > > > > @@ -2916,6 +3013,14 @@ main(int argc, char **argv)
> > > > >               case 'g':
> > > > >                       filldata = *optarg;
> > > > >                       break;
> > > > > +             case 'h':
> > > > > +                     #ifndef MADV_COLLAPSE
> > > >
> > > > Preprocessor directives should start at column 0, like most of the rest
> > > > of fstests.
> > > >
> > > > --D
> > > >
> >
>
Darrick J. Wong Jan. 17, 2025, 1:57 a.m. UTC | #7
On Thu, Jan 16, 2025 at 05:03:41PM -0800, Joanne Koong wrote:
> On Wed, Jan 15, 2025 at 4:59 PM Darrick J. Wong <djwong@kernel.org> wrote:
> >
> > On Wed, Jan 15, 2025 at 04:47:30PM -0800, Joanne Koong wrote:
> > > On Wed, Jan 15, 2025 at 1:37 PM Darrick J. Wong <djwong@kernel.org> wrote:
> > > >
> > > > On Wed, Jan 15, 2025 at 10:31:06AM -0800, Joanne Koong wrote:
> > > > > Add support for reads/writes from buffers backed by hugepages.
> > > > > This can be enabled through the '-h' flag. This flag should only be used
> > > > > on systems where THP capabilities are enabled.
> > > > >
> > > > > This is motivated by a recent bug that was due to faulty handling of
> > > > > userspace buffers backed by hugepages. This patch is a mitigation
> > > > > against problems like this in the future.
> > > > >
> > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > > > ---
> > > > >  ltp/fsx.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > > > >  1 file changed, 108 insertions(+), 11 deletions(-)
> > > > >
> > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > > index 41933354..8d3a2e2c 100644
> > > > > --- a/ltp/fsx.c
> > > > > +++ b/ltp/fsx.c
> > > > > @@ -190,6 +190,7 @@ int       o_direct;                       /* -Z */
> > > > >  int  aio = 0;
> > > > >  int  uring = 0;
> > > > >  int  mark_nr = 0;
> > > > > +int  hugepages = 0;                  /* -h flag */
> > > > >
> > > > >  int page_size;
> > > > >  int page_mask;
> > > > > @@ -2471,7 +2472,7 @@ void
> > > > >  usage(void)
> > > > >  {
> > > > >       fprintf(stdout, "usage: %s",
> > > > > -             "fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\
> > > > > +             "fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\
> > > > >          [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\
> > > > >          [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\
> > > > >          [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\
> > > > > @@ -2484,6 +2485,7 @@ usage(void)
> > > > >       -e: pollute post-eof on size changes (default 0)\n\
> > > > >       -f: flush and invalidate cache after I/O\n\
> > > > >       -g X: write character X instead of random generated data\n\
> > > > > +     -h hugepages: use buffers backed by hugepages for reads/writes\n\
> > > >
> > > > If this requires MADV_COLLAPSE, then perhaps the help text shouldn't
> > > > describe the switch if the support wasn't compiled in?
> > > >
> > > > e.g.
> > > >
> > > >         -g X: write character X instead of random generated data\n"
> > > > #ifdef MADV_COLLAPSE
> > > > "       -h hugepages: use buffers backed by hugepages for reads/writes\n"
> > > > #endif
> > > > "       -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > > >
> > > > (assuming I got the preprocessor and string construction goo right; I
> > > > might be a few cards short of a deck due to zombie attack earlier)
> > >
> > > Sounds great, I'll #ifdef out the help text -h line. Hope you feel better.
> > > >
> > > > >       -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > > > >       -j logid: prefix debug log messsages with this id\n\
> > > > >       -k: do not truncate existing file and use its size as upper bound on file size\n\
> > > [...]
> > > > > +}
> > > > > +
> > > > > +#ifdef MADV_COLLAPSE
> > > > > +static void *
> > > > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> > > > > +{
> > > > > +     void *buf;
> > > > > +     long buf_size = roundup(len, hugepage_size) + alignment;
> > > > > +
> > > > > +     if (posix_memalign(&buf, hugepage_size, buf_size)) {
> > > > > +             prterr("posix_memalign for buf");
> > > > > +             return NULL;
> > > > > +     }
> > > > > +     memset(buf, '\0', buf_size);
> > > > > +     if (madvise(buf, buf_size, MADV_COLLAPSE)) {
> > > >
> > > > If the fsx runs for a long period of time, will it be necessary to call
> > > > MADV_COLLAPSE periodically to ensure that reclaim doesn't break up the
> > > > hugepage?
> > > >
> > >
> > > imo, I don't think so. My understanding is that this would be a rare
> > > edge case that happens when the system is constrained on memory, in
> > > which case subsequent calls to MADV_COLLAPSE would most likely fail
> > > anyways.
> >
> > Hrmmm... well I /do/ like to run memory constrained VMs to prod reclaim
> > into stressing the filesystem more.  But I guess there's no good way for
> > fsx to know that something happened to it.  Unless there's some even
> > goofier way to force a hugepage, like shmem/hugetlbfs (ugh!) :)
> 
> I can't think of a better way to force a hugepage either. I believe
> shmem and hugetlbfs would both require root privileges to do so, and
> if i'm not mistaken, shmem hugepages are still subject to being broken
> up by reclaim.
> 
> >
> > Will have to ponder hugepage renewasl -- maybe we should madvise every
> > few thousand fsxops just to be careful?
> 
> I can add this in, but on memory constrained VMs, would this be
> effective? To me, it seems like in the majority of cases, subsequent

They're not /so/ memory constrained that the initial collapsed page is
likely to get split/reclaimed before your regression test finishes...

> attempts at collapsing the broken pages back into a hugepage would
> fail due to memory still being constrained. In which case, I guess
> we'd exit the test altogether?

...but I was starting to wonder if this is something we'd actually want
in a long soak test, just in case there are weird effects on the system
after fsx has been running for a few days?

>                                It kind of seems to me like if the user
> wants to test out hugepages functionality of their filesystem, then
> the onus is on them to run the test in an environment that can
> adequately and consistently support hugepages.

I guess the hard part about analyzing that is that long soak tests
aren't usually supposed to die on account of memory fragmentation.
Hence me wondering if there's an "easy" way to get one huge page and not
let it go.

Anyway don't let my blathering hold this up; I think once you fix the
help text and the #ifdef indenting around exit(86) this patch is good to
go.

--D

> 
> Thanks,
> Joanne
> 
> >
> > --D
> >
> > >
> > > Thanks,
> > > Joanne
> > >
> > > > > +             prterr("madvise collapse for buf");
> > > > > +             free(buf);
> > > > > +             return NULL;
> > > > > +     }
> > > > > +
> > > > > +     return buf;
> > > > > +}
> > > > > +#else
> > > > > +static void *
> > > > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> > > > > +{
> > > > > +     return NULL;
> > > > > +}
> > > > > +#endif
> > > > > +
> > > > > +static void
> > > > > +init_buffers(void)
> > > > > +{
> > > > > +     int i;
> > > > > +
> > > > > +     original_buf = (char *) malloc(maxfilelen);
> > > > > +     for (i = 0; i < maxfilelen; i++)
> > > > > +             original_buf[i] = random() % 256;
> > > > > +     if (hugepages) {
> > > > > +             long hugepage_size = get_hugepage_size();
> > > > > +             if (hugepage_size == -1) {
> > > > > +                     prterr("get_hugepage_size()");
> > > > > +                     exit(102);
> > > > > +             }
> > > > > +             good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy);
> > > > > +             if (!good_buf) {
> > > > > +                     prterr("init_hugepages_buf failed for good_buf");
> > > > > +                     exit(103);
> > > > > +             }
> > > > > +
> > > > > +             temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy);
> > > > > +             if (!temp_buf) {
> > > > > +                     prterr("init_hugepages_buf failed for temp_buf");
> > > > > +                     exit(103);
> > > > > +             }
> > > > > +     } else {
> > > > > +             unsigned long good_buf_len = maxfilelen + writebdy;
> > > > > +             unsigned long temp_buf_len = maxoplen + readbdy;
> > > > > +
> > > > > +             good_buf = calloc(1, good_buf_len);
> > > > > +             temp_buf = calloc(1, temp_buf_len);
> > > > > +     }
> > > > > +     good_buf = round_ptr_up(good_buf, writebdy, 0);
> > > > > +     temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> > > > > +}
> > > > > +
> > > > >  static struct option longopts[] = {
> > > > >       {"replay-ops", required_argument, 0, 256},
> > > > >       {"record-ops", optional_argument, 0, 255},
> > > > > @@ -2883,7 +2980,7 @@ main(int argc, char **argv)
> > > > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > > > >
> > > > >       while ((ch = getopt_long(argc, argv,
> > > > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > >                                longopts, NULL)) != EOF)
> > > > >               switch (ch) {
> > > > >               case 'b':
> > > > > @@ -2916,6 +3013,14 @@ main(int argc, char **argv)
> > > > >               case 'g':
> > > > >                       filldata = *optarg;
> > > > >                       break;
> > > > > +             case 'h':
> > > > > +                     #ifndef MADV_COLLAPSE
> > > >
> > > > Preprocessor directives should start at column 0, like most of the rest
> > > > of fstests.
> > > >
> > > > --D
> > > >
Brian Foster Jan. 17, 2025, 1:27 p.m. UTC | #8
On Thu, Jan 16, 2025 at 05:26:31PM -0800, Joanne Koong wrote:
> On Thu, Jan 16, 2025 at 4:51 AM Brian Foster <bfoster@redhat.com> wrote:
> >
> > On Wed, Jan 15, 2025 at 04:59:19PM -0800, Darrick J. Wong wrote:
> > > On Wed, Jan 15, 2025 at 04:47:30PM -0800, Joanne Koong wrote:
> > > > On Wed, Jan 15, 2025 at 1:37 PM Darrick J. Wong <djwong@kernel.org> wrote:
> > > > >
> > > > > On Wed, Jan 15, 2025 at 10:31:06AM -0800, Joanne Koong wrote:
> > > > > > Add support for reads/writes from buffers backed by hugepages.
> > > > > > This can be enabled through the '-h' flag. This flag should only be used
> > > > > > on systems where THP capabilities are enabled.
> > > > > >
> > > > > > This is motivated by a recent bug that was due to faulty handling of
> > > > > > userspace buffers backed by hugepages. This patch is a mitigation
> > > > > > against problems like this in the future.
> > > > > >
> > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > > > > ---
> > > > > >  ltp/fsx.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > > > > >  1 file changed, 108 insertions(+), 11 deletions(-)
> > > > > >
> > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > > > index 41933354..8d3a2e2c 100644
> > > > > > --- a/ltp/fsx.c
> > > > > > +++ b/ltp/fsx.c
> > > > > > @@ -190,6 +190,7 @@ int       o_direct;                       /* -Z */
> > > > > >  int  aio = 0;
> > > > > >  int  uring = 0;
> > > > > >  int  mark_nr = 0;
> > > > > > +int  hugepages = 0;                  /* -h flag */
> > > > > >
> > > > > >  int page_size;
> > > > > >  int page_mask;
> > > > > > @@ -2471,7 +2472,7 @@ void
> > > > > >  usage(void)
> > > > > >  {
> > > > > >       fprintf(stdout, "usage: %s",
> > > > > > -             "fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\
> > > > > > +             "fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\
> > > > > >          [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\
> > > > > >          [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\
> > > > > >          [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\
> > > > > > @@ -2484,6 +2485,7 @@ usage(void)
> > > > > >       -e: pollute post-eof on size changes (default 0)\n\
> > > > > >       -f: flush and invalidate cache after I/O\n\
> > > > > >       -g X: write character X instead of random generated data\n\
> > > > > > +     -h hugepages: use buffers backed by hugepages for reads/writes\n\
> > > > >
> > > > > If this requires MADV_COLLAPSE, then perhaps the help text shouldn't
> > > > > describe the switch if the support wasn't compiled in?
> > > > >
> > > > > e.g.
> > > > >
> > > > >         -g X: write character X instead of random generated data\n"
> > > > > #ifdef MADV_COLLAPSE
> > > > > "       -h hugepages: use buffers backed by hugepages for reads/writes\n"
> > > > > #endif
> > > > > "       -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > > > >
> > > > > (assuming I got the preprocessor and string construction goo right; I
> > > > > might be a few cards short of a deck due to zombie attack earlier)
> > > >
> > > > Sounds great, I'll #ifdef out the help text -h line. Hope you feel better.
> > > > >
> > > > > >       -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > > > > >       -j logid: prefix debug log messsages with this id\n\
> > > > > >       -k: do not truncate existing file and use its size as upper bound on file size\n\
> > > > [...]
> > > > > > +}
> > > > > > +
> > > > > > +#ifdef MADV_COLLAPSE
> > > > > > +static void *
> > > > > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> > > > > > +{
> > > > > > +     void *buf;
> > > > > > +     long buf_size = roundup(len, hugepage_size) + alignment;
> > > > > > +
> > > > > > +     if (posix_memalign(&buf, hugepage_size, buf_size)) {
> > > > > > +             prterr("posix_memalign for buf");
> > > > > > +             return NULL;
> > > > > > +     }
> > > > > > +     memset(buf, '\0', buf_size);
> > > > > > +     if (madvise(buf, buf_size, MADV_COLLAPSE)) {
> > > > >
> > > > > If the fsx runs for a long period of time, will it be necessary to call
> > > > > MADV_COLLAPSE periodically to ensure that reclaim doesn't break up the
> > > > > hugepage?
> > > > >
> > > >
> > > > imo, I don't think so. My understanding is that this would be a rare
> > > > edge case that happens when the system is constrained on memory, in
> > > > which case subsequent calls to MADV_COLLAPSE would most likely fail
> > > > anyways.
> > >
> > > Hrmmm... well I /do/ like to run memory constrained VMs to prod reclaim
> > > into stressing the filesystem more.  But I guess there's no good way for
> > > fsx to know that something happened to it.  Unless there's some even
> > > goofier way to force a hugepage, like shmem/hugetlbfs (ugh!) :)
> > >
> > > Will have to ponder hugepage renewasl -- maybe we should madvise every
> > > few thousand fsxops just to be careful?
> > >
> >
> > I wonder.. is there test value in doing collapses to the target file as
> > well, either as a standalone map/madvise command or a random thing
> > hitched onto preexisting commands? If so, I could see how something like
> > that could potentially lift the current init time only approach into
> > something that occurs with frequency, which then could at the same time
> > (again maybe randomly) reinvoke for internal buffers as well.
> 
> My understanding is that if a filesystem has support enabled for large
> folios, then doing large writes/reads (which I believe is currently
> supported in fsx via the -o flag) will already automatically test the
> functionality of how the filesystem handles hugepages. I don't think
> this would be different from what doing a collapse on the target file
> would do.
> 

Ah, that is a good point. So maybe not that useful to have something
that would hook into writes. OTOH, fsx does a lot of random ops in the
general case. I wonder how likely it is to sustain large folios in a
typical long running test and whether explicit madvise calls thrown into
the mix would make any difference at all.

I suppose there may also be an argument that doing collapses provides
more test coverage than purely doing larger folio allocations at write
time..? I don't know the code well enough to say whether there is any
value there. FWIW, what I think is more interesting from the fsx side is
the oddball sequences of operations that it can create to uncover
similarly odd problems. IOW, in theory if we had a randomish "collapse
target range before next operation," would that effectively provide more
coverage with how the various supported ops interact with large folios
over current behavior?

But anyways, this is all nebulous and strikes me more as maybe something
interesting to play with as a potential future enhancement more than
anything. BTW, is there any good way to measure use of large folios in
general and/or on a particular file? I.e., collapse/split stats or some
such thing..? Thanks.

Brian

> 
> Thanks,
> Joanne
> 
> >
> > All that said, this is new functionality and IIUC provides functional
> > test coverage for a valid issue. IMO, it would be nice to get this
> > merged as a baseline feature and explore these sort of enhancements as
> > followon work. Just my .02.
> >
> > Brian
> >
> > > --D
> > >
> > > >
> > > > Thanks,
> > > > Joanne
> > > >
> > > > > > +             prterr("madvise collapse for buf");
> > > > > > +             free(buf);
> > > > > > +             return NULL;
> > > > > > +     }
> > > > > > +
> > > > > > +     return buf;
> > > > > > +}
> > > > > > +#else
> > > > > > +static void *
> > > > > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> > > > > > +{
> > > > > > +     return NULL;
> > > > > > +}
> > > > > > +#endif
> > > > > > +
> > > > > > +static void
> > > > > > +init_buffers(void)
> > > > > > +{
> > > > > > +     int i;
> > > > > > +
> > > > > > +     original_buf = (char *) malloc(maxfilelen);
> > > > > > +     for (i = 0; i < maxfilelen; i++)
> > > > > > +             original_buf[i] = random() % 256;
> > > > > > +     if (hugepages) {
> > > > > > +             long hugepage_size = get_hugepage_size();
> > > > > > +             if (hugepage_size == -1) {
> > > > > > +                     prterr("get_hugepage_size()");
> > > > > > +                     exit(102);
> > > > > > +             }
> > > > > > +             good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy);
> > > > > > +             if (!good_buf) {
> > > > > > +                     prterr("init_hugepages_buf failed for good_buf");
> > > > > > +                     exit(103);
> > > > > > +             }
> > > > > > +
> > > > > > +             temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy);
> > > > > > +             if (!temp_buf) {
> > > > > > +                     prterr("init_hugepages_buf failed for temp_buf");
> > > > > > +                     exit(103);
> > > > > > +             }
> > > > > > +     } else {
> > > > > > +             unsigned long good_buf_len = maxfilelen + writebdy;
> > > > > > +             unsigned long temp_buf_len = maxoplen + readbdy;
> > > > > > +
> > > > > > +             good_buf = calloc(1, good_buf_len);
> > > > > > +             temp_buf = calloc(1, temp_buf_len);
> > > > > > +     }
> > > > > > +     good_buf = round_ptr_up(good_buf, writebdy, 0);
> > > > > > +     temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> > > > > > +}
> > > > > > +
> > > > > >  static struct option longopts[] = {
> > > > > >       {"replay-ops", required_argument, 0, 256},
> > > > > >       {"record-ops", optional_argument, 0, 255},
> > > > > > @@ -2883,7 +2980,7 @@ main(int argc, char **argv)
> > > > > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > > > > >
> > > > > >       while ((ch = getopt_long(argc, argv,
> > > > > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > >                                longopts, NULL)) != EOF)
> > > > > >               switch (ch) {
> > > > > >               case 'b':
> > > > > > @@ -2916,6 +3013,14 @@ main(int argc, char **argv)
> > > > > >               case 'g':
> > > > > >                       filldata = *optarg;
> > > > > >                       break;
> > > > > > +             case 'h':
> > > > > > +                     #ifndef MADV_COLLAPSE
> > > > >
> > > > > Preprocessor directives should start at column 0, like most of the rest
> > > > > of fstests.
> > > > >
> > > > > --D
> > > > >
> > >
> >
>
Darrick J. Wong Jan. 17, 2025, 5:43 p.m. UTC | #9
On Fri, Jan 17, 2025 at 08:27:24AM -0500, Brian Foster wrote:
> On Thu, Jan 16, 2025 at 05:26:31PM -0800, Joanne Koong wrote:
> > On Thu, Jan 16, 2025 at 4:51 AM Brian Foster <bfoster@redhat.com> wrote:
> > >
> > > On Wed, Jan 15, 2025 at 04:59:19PM -0800, Darrick J. Wong wrote:
> > > > On Wed, Jan 15, 2025 at 04:47:30PM -0800, Joanne Koong wrote:
> > > > > On Wed, Jan 15, 2025 at 1:37 PM Darrick J. Wong <djwong@kernel.org> wrote:
> > > > > >
> > > > > > On Wed, Jan 15, 2025 at 10:31:06AM -0800, Joanne Koong wrote:
> > > > > > > Add support for reads/writes from buffers backed by hugepages.
> > > > > > > This can be enabled through the '-h' flag. This flag should only be used
> > > > > > > on systems where THP capabilities are enabled.
> > > > > > >
> > > > > > > This is motivated by a recent bug that was due to faulty handling of
> > > > > > > userspace buffers backed by hugepages. This patch is a mitigation
> > > > > > > against problems like this in the future.
> > > > > > >
> > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > > > > > ---
> > > > > > >  ltp/fsx.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > > > > > >  1 file changed, 108 insertions(+), 11 deletions(-)
> > > > > > >
> > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > > > > index 41933354..8d3a2e2c 100644
> > > > > > > --- a/ltp/fsx.c
> > > > > > > +++ b/ltp/fsx.c
> > > > > > > @@ -190,6 +190,7 @@ int       o_direct;                       /* -Z */
> > > > > > >  int  aio = 0;
> > > > > > >  int  uring = 0;
> > > > > > >  int  mark_nr = 0;
> > > > > > > +int  hugepages = 0;                  /* -h flag */
> > > > > > >
> > > > > > >  int page_size;
> > > > > > >  int page_mask;
> > > > > > > @@ -2471,7 +2472,7 @@ void
> > > > > > >  usage(void)
> > > > > > >  {
> > > > > > >       fprintf(stdout, "usage: %s",
> > > > > > > -             "fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\
> > > > > > > +             "fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\
> > > > > > >          [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\
> > > > > > >          [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\
> > > > > > >          [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\
> > > > > > > @@ -2484,6 +2485,7 @@ usage(void)
> > > > > > >       -e: pollute post-eof on size changes (default 0)\n\
> > > > > > >       -f: flush and invalidate cache after I/O\n\
> > > > > > >       -g X: write character X instead of random generated data\n\
> > > > > > > +     -h hugepages: use buffers backed by hugepages for reads/writes\n\
> > > > > >
> > > > > > If this requires MADV_COLLAPSE, then perhaps the help text shouldn't
> > > > > > describe the switch if the support wasn't compiled in?
> > > > > >
> > > > > > e.g.
> > > > > >
> > > > > >         -g X: write character X instead of random generated data\n"
> > > > > > #ifdef MADV_COLLAPSE
> > > > > > "       -h hugepages: use buffers backed by hugepages for reads/writes\n"
> > > > > > #endif
> > > > > > "       -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > > > > >
> > > > > > (assuming I got the preprocessor and string construction goo right; I
> > > > > > might be a few cards short of a deck due to zombie attack earlier)
> > > > >
> > > > > Sounds great, I'll #ifdef out the help text -h line. Hope you feel better.
> > > > > >
> > > > > > >       -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > > > > > >       -j logid: prefix debug log messsages with this id\n\
> > > > > > >       -k: do not truncate existing file and use its size as upper bound on file size\n\
> > > > > [...]
> > > > > > > +}
> > > > > > > +
> > > > > > > +#ifdef MADV_COLLAPSE
> > > > > > > +static void *
> > > > > > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> > > > > > > +{
> > > > > > > +     void *buf;
> > > > > > > +     long buf_size = roundup(len, hugepage_size) + alignment;
> > > > > > > +
> > > > > > > +     if (posix_memalign(&buf, hugepage_size, buf_size)) {
> > > > > > > +             prterr("posix_memalign for buf");
> > > > > > > +             return NULL;
> > > > > > > +     }
> > > > > > > +     memset(buf, '\0', buf_size);
> > > > > > > +     if (madvise(buf, buf_size, MADV_COLLAPSE)) {
> > > > > >
> > > > > > If the fsx runs for a long period of time, will it be necessary to call
> > > > > > MADV_COLLAPSE periodically to ensure that reclaim doesn't break up the
> > > > > > hugepage?
> > > > > >
> > > > >
> > > > > imo, I don't think so. My understanding is that this would be a rare
> > > > > edge case that happens when the system is constrained on memory, in
> > > > > which case subsequent calls to MADV_COLLAPSE would most likely fail
> > > > > anyways.
> > > >
> > > > Hrmmm... well I /do/ like to run memory constrained VMs to prod reclaim
> > > > into stressing the filesystem more.  But I guess there's no good way for
> > > > fsx to know that something happened to it.  Unless there's some even
> > > > goofier way to force a hugepage, like shmem/hugetlbfs (ugh!) :)
> > > >
> > > > Will have to ponder hugepage renewasl -- maybe we should madvise every
> > > > few thousand fsxops just to be careful?
> > > >
> > >
> > > I wonder.. is there test value in doing collapses to the target file as
> > > well, either as a standalone map/madvise command or a random thing
> > > hitched onto preexisting commands? If so, I could see how something like
> > > that could potentially lift the current init time only approach into
> > > something that occurs with frequency, which then could at the same time
> > > (again maybe randomly) reinvoke for internal buffers as well.
> > 
> > My understanding is that if a filesystem has support enabled for large
> > folios, then doing large writes/reads (which I believe is currently
> > supported in fsx via the -o flag) will already automatically test the
> > functionality of how the filesystem handles hugepages. I don't think
> > this would be different from what doing a collapse on the target file
> > would do.
> > 
> 
> Ah, that is a good point. So maybe not that useful to have something
> that would hook into writes. OTOH, fsx does a lot of random ops in the
> general case. I wonder how likely it is to sustain large folios in a
> typical long running test and whether explicit madvise calls thrown into
> the mix would make any difference at all.
> 
> I suppose there may also be an argument that doing collapses provides
> more test coverage than purely doing larger folio allocations at write
> time..? I don't know the code well enough to say whether there is any
> value there. FWIW, what I think is more interesting from the fsx side is
> the oddball sequences of operations that it can create to uncover
> similarly odd problems. IOW, in theory if we had a randomish "collapse
> target range before next operation," would that effectively provide more
> coverage with how the various supported ops interact with large folios
> over current behavior?
> 
> But anyways, this is all nebulous and strikes me more as maybe something
> interesting to play with as a potential future enhancement more than
> anything. BTW, is there any good way to measure use of large folios in
> general and/or on a particular file? I.e., collapse/split stats or some
> such thing..? Thanks.

I only know of two -- hooking the mm_filemap_add_to_page_cache
tracepoint, and running MADV_COLLAPSE to see if it returns an errno.

--D

> Brian
> 
> > 
> > Thanks,
> > Joanne
> > 
> > >
> > > All that said, this is new functionality and IIUC provides functional
> > > test coverage for a valid issue. IMO, it would be nice to get this
> > > merged as a baseline feature and explore these sort of enhancements as
> > > followon work. Just my .02.
> > >
> > > Brian
> > >
> > > > --D
> > > >
> > > > >
> > > > > Thanks,
> > > > > Joanne
> > > > >
> > > > > > > +             prterr("madvise collapse for buf");
> > > > > > > +             free(buf);
> > > > > > > +             return NULL;
> > > > > > > +     }
> > > > > > > +
> > > > > > > +     return buf;
> > > > > > > +}
> > > > > > > +#else
> > > > > > > +static void *
> > > > > > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> > > > > > > +{
> > > > > > > +     return NULL;
> > > > > > > +}
> > > > > > > +#endif
> > > > > > > +
> > > > > > > +static void
> > > > > > > +init_buffers(void)
> > > > > > > +{
> > > > > > > +     int i;
> > > > > > > +
> > > > > > > +     original_buf = (char *) malloc(maxfilelen);
> > > > > > > +     for (i = 0; i < maxfilelen; i++)
> > > > > > > +             original_buf[i] = random() % 256;
> > > > > > > +     if (hugepages) {
> > > > > > > +             long hugepage_size = get_hugepage_size();
> > > > > > > +             if (hugepage_size == -1) {
> > > > > > > +                     prterr("get_hugepage_size()");
> > > > > > > +                     exit(102);
> > > > > > > +             }
> > > > > > > +             good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy);
> > > > > > > +             if (!good_buf) {
> > > > > > > +                     prterr("init_hugepages_buf failed for good_buf");
> > > > > > > +                     exit(103);
> > > > > > > +             }
> > > > > > > +
> > > > > > > +             temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy);
> > > > > > > +             if (!temp_buf) {
> > > > > > > +                     prterr("init_hugepages_buf failed for temp_buf");
> > > > > > > +                     exit(103);
> > > > > > > +             }
> > > > > > > +     } else {
> > > > > > > +             unsigned long good_buf_len = maxfilelen + writebdy;
> > > > > > > +             unsigned long temp_buf_len = maxoplen + readbdy;
> > > > > > > +
> > > > > > > +             good_buf = calloc(1, good_buf_len);
> > > > > > > +             temp_buf = calloc(1, temp_buf_len);
> > > > > > > +     }
> > > > > > > +     good_buf = round_ptr_up(good_buf, writebdy, 0);
> > > > > > > +     temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> > > > > > > +}
> > > > > > > +
> > > > > > >  static struct option longopts[] = {
> > > > > > >       {"replay-ops", required_argument, 0, 256},
> > > > > > >       {"record-ops", optional_argument, 0, 255},
> > > > > > > @@ -2883,7 +2980,7 @@ main(int argc, char **argv)
> > > > > > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > > > > > >
> > > > > > >       while ((ch = getopt_long(argc, argv,
> > > > > > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > >                                longopts, NULL)) != EOF)
> > > > > > >               switch (ch) {
> > > > > > >               case 'b':
> > > > > > > @@ -2916,6 +3013,14 @@ main(int argc, char **argv)
> > > > > > >               case 'g':
> > > > > > >                       filldata = *optarg;
> > > > > > >                       break;
> > > > > > > +             case 'h':
> > > > > > > +                     #ifndef MADV_COLLAPSE
> > > > > >
> > > > > > Preprocessor directives should start at column 0, like most of the rest
> > > > > > of fstests.
> > > > > >
> > > > > > --D
> > > > > >
> > > >
> > >
> > 
> 
>
Joanne Koong Jan. 17, 2025, 9:18 p.m. UTC | #10
On Fri, Jan 17, 2025 at 5:25 AM Brian Foster <bfoster@redhat.com> wrote:
>
> On Thu, Jan 16, 2025 at 05:26:31PM -0800, Joanne Koong wrote:
> > On Thu, Jan 16, 2025 at 4:51 AM Brian Foster <bfoster@redhat.com> wrote:
> > >
> > > On Wed, Jan 15, 2025 at 04:59:19PM -0800, Darrick J. Wong wrote:
> > > > On Wed, Jan 15, 2025 at 04:47:30PM -0800, Joanne Koong wrote:
> > > > > On Wed, Jan 15, 2025 at 1:37 PM Darrick J. Wong <djwong@kernel.org> wrote:
> > > > > >
> > > > > > On Wed, Jan 15, 2025 at 10:31:06AM -0800, Joanne Koong wrote:
> > > > > > > Add support for reads/writes from buffers backed by hugepages.
> > > > > > > This can be enabled through the '-h' flag. This flag should only be used
> > > > > > > on systems where THP capabilities are enabled.
> > > > > > >
> > > > > > > This is motivated by a recent bug that was due to faulty handling of
> > > > > > > userspace buffers backed by hugepages. This patch is a mitigation
> > > > > > > against problems like this in the future.
> > > > > > >
> > > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > > > > > ---
> > > > > > >  ltp/fsx.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > > > > > >  1 file changed, 108 insertions(+), 11 deletions(-)
> > > > > > >
> > > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > > > > index 41933354..8d3a2e2c 100644
> > > > > > > --- a/ltp/fsx.c
> > > > > > > +++ b/ltp/fsx.c
> > > > > > > @@ -190,6 +190,7 @@ int       o_direct;                       /* -Z */
> > > > > > >  int  aio = 0;
> > > > > > >  int  uring = 0;
> > > > > > >  int  mark_nr = 0;
> > > > > > > +int  hugepages = 0;                  /* -h flag */
> > > > > > >
> > > > > > >  int page_size;
> > > > > > >  int page_mask;
> > > > > > > @@ -2471,7 +2472,7 @@ void
> > > > > > >  usage(void)
> > > > > > >  {
> > > > > > >       fprintf(stdout, "usage: %s",
> > > > > > > -             "fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\
> > > > > > > +             "fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\
> > > > > > >          [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\
> > > > > > >          [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\
> > > > > > >          [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\
> > > > > > > @@ -2484,6 +2485,7 @@ usage(void)
> > > > > > >       -e: pollute post-eof on size changes (default 0)\n\
> > > > > > >       -f: flush and invalidate cache after I/O\n\
> > > > > > >       -g X: write character X instead of random generated data\n\
> > > > > > > +     -h hugepages: use buffers backed by hugepages for reads/writes\n\
> > > > > >
> > > > > > If this requires MADV_COLLAPSE, then perhaps the help text shouldn't
> > > > > > describe the switch if the support wasn't compiled in?
> > > > > >
> > > > > > e.g.
> > > > > >
> > > > > >         -g X: write character X instead of random generated data\n"
> > > > > > #ifdef MADV_COLLAPSE
> > > > > > "       -h hugepages: use buffers backed by hugepages for reads/writes\n"
> > > > > > #endif
> > > > > > "       -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > > > > >
> > > > > > (assuming I got the preprocessor and string construction goo right; I
> > > > > > might be a few cards short of a deck due to zombie attack earlier)
> > > > >
> > > > > Sounds great, I'll #ifdef out the help text -h line. Hope you feel better.
> > > > > >
> > > > > > >       -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > > > > > >       -j logid: prefix debug log messsages with this id\n\
> > > > > > >       -k: do not truncate existing file and use its size as upper bound on file size\n\
> > > > > [...]
> > > > > > > +}
> > > > > > > +
> > > > > > > +#ifdef MADV_COLLAPSE
> > > > > > > +static void *
> > > > > > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> > > > > > > +{
> > > > > > > +     void *buf;
> > > > > > > +     long buf_size = roundup(len, hugepage_size) + alignment;
> > > > > > > +
> > > > > > > +     if (posix_memalign(&buf, hugepage_size, buf_size)) {
> > > > > > > +             prterr("posix_memalign for buf");
> > > > > > > +             return NULL;
> > > > > > > +     }
> > > > > > > +     memset(buf, '\0', buf_size);
> > > > > > > +     if (madvise(buf, buf_size, MADV_COLLAPSE)) {
> > > > > >
> > > > > > If the fsx runs for a long period of time, will it be necessary to call
> > > > > > MADV_COLLAPSE periodically to ensure that reclaim doesn't break up the
> > > > > > hugepage?
> > > > > >
> > > > >
> > > > > imo, I don't think so. My understanding is that this would be a rare
> > > > > edge case that happens when the system is constrained on memory, in
> > > > > which case subsequent calls to MADV_COLLAPSE would most likely fail
> > > > > anyways.
> > > >
> > > > Hrmmm... well I /do/ like to run memory constrained VMs to prod reclaim
> > > > into stressing the filesystem more.  But I guess there's no good way for
> > > > fsx to know that something happened to it.  Unless there's some even
> > > > goofier way to force a hugepage, like shmem/hugetlbfs (ugh!) :)
> > > >
> > > > Will have to ponder hugepage renewasl -- maybe we should madvise every
> > > > few thousand fsxops just to be careful?
> > > >
> > >
> > > I wonder.. is there test value in doing collapses to the target file as
> > > well, either as a standalone map/madvise command or a random thing
> > > hitched onto preexisting commands? If so, I could see how something like
> > > that could potentially lift the current init time only approach into
> > > something that occurs with frequency, which then could at the same time
> > > (again maybe randomly) reinvoke for internal buffers as well.
> >
> > My understanding is that if a filesystem has support enabled for large
> > folios, then doing large writes/reads (which I believe is currently
> > supported in fsx via the -o flag) will already automatically test the
> > functionality of how the filesystem handles hugepages. I don't think
> > this would be different from what doing a collapse on the target file
> > would do.
> >
>
> Ah, that is a good point. So maybe not that useful to have something
> that would hook into writes. OTOH, fsx does a lot of random ops in the
> general case. I wonder how likely it is to sustain large folios in a
> typical long running test and whether explicit madvise calls thrown into
> the mix would make any difference at all.

I think this would run into the same problem: if memory is constrained
severely enough that reclaim has to break up hugepages in the page
cache, then explicit madvise calls to collapse these pages back
together would most likely fail.

>
> I suppose there may also be an argument that doing collapses provides
> more test coverage than purely doing larger folio allocations at write
> time..? I don't know the code well enough to say whether there is any
> value there. FWIW, what I think is more interesting from the fsx side is
> the oddball sequences of operations that it can create to uncover
> similarly odd problems. IOW, in theory if we had a randomish "collapse
> target range before next operation," would that effectively provide more
> coverage with how the various supported ops interact with large folios
> over current behavior?

I don't think collapses would have any effect on test coverage. For
filesystems that don't support hugepages, the collapse would fail, and
for filesystems that do support hugepages, the folio would already be
a hugepage and the collapse would be a no-op.


Thanks,
Joanne

>
> But anyways, this is all nebulous and strikes me more as maybe something
> interesting to play with as a potential future enhancement more than
> anything. BTW, is there any good way to measure use of large folios in
> general and/or on a particular file? I.e., collapse/split stats or some
> such thing..? Thanks.
>
> Brian
>
> >
> > Thanks,
> > Joanne
> >
> > >
> > > All that said, this is new functionality and IIUC provides functional
> > > test coverage for a valid issue. IMO, it would be nice to get this
> > > merged as a baseline feature and explore these sort of enhancements as
> > > followon work. Just my .02.
> > >
> > > Brian
> > >
> > > > --D
> > > >
> > > > >
> > > > > Thanks,
> > > > > Joanne
> > > > >
> > > > > > > +             prterr("madvise collapse for buf");
> > > > > > > +             free(buf);
> > > > > > > +             return NULL;
> > > > > > > +     }
> > > > > > > +
> > > > > > > +     return buf;
> > > > > > > +}
> > > > > > > +#else
> > > > > > > +static void *
> > > > > > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> > > > > > > +{
> > > > > > > +     return NULL;
> > > > > > > +}
> > > > > > > +#endif
> > > > > > > +
> > > > > > > +static void
> > > > > > > +init_buffers(void)
> > > > > > > +{
> > > > > > > +     int i;
> > > > > > > +
> > > > > > > +     original_buf = (char *) malloc(maxfilelen);
> > > > > > > +     for (i = 0; i < maxfilelen; i++)
> > > > > > > +             original_buf[i] = random() % 256;
> > > > > > > +     if (hugepages) {
> > > > > > > +             long hugepage_size = get_hugepage_size();
> > > > > > > +             if (hugepage_size == -1) {
> > > > > > > +                     prterr("get_hugepage_size()");
> > > > > > > +                     exit(102);
> > > > > > > +             }
> > > > > > > +             good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy);
> > > > > > > +             if (!good_buf) {
> > > > > > > +                     prterr("init_hugepages_buf failed for good_buf");
> > > > > > > +                     exit(103);
> > > > > > > +             }
> > > > > > > +
> > > > > > > +             temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy);
> > > > > > > +             if (!temp_buf) {
> > > > > > > +                     prterr("init_hugepages_buf failed for temp_buf");
> > > > > > > +                     exit(103);
> > > > > > > +             }
> > > > > > > +     } else {
> > > > > > > +             unsigned long good_buf_len = maxfilelen + writebdy;
> > > > > > > +             unsigned long temp_buf_len = maxoplen + readbdy;
> > > > > > > +
> > > > > > > +             good_buf = calloc(1, good_buf_len);
> > > > > > > +             temp_buf = calloc(1, temp_buf_len);
> > > > > > > +     }
> > > > > > > +     good_buf = round_ptr_up(good_buf, writebdy, 0);
> > > > > > > +     temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> > > > > > > +}
> > > > > > > +
> > > > > > >  static struct option longopts[] = {
> > > > > > >       {"replay-ops", required_argument, 0, 256},
> > > > > > >       {"record-ops", optional_argument, 0, 255},
> > > > > > > @@ -2883,7 +2980,7 @@ main(int argc, char **argv)
> > > > > > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > > > > > >
> > > > > > >       while ((ch = getopt_long(argc, argv,
> > > > > > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > >                                longopts, NULL)) != EOF)
> > > > > > >               switch (ch) {
> > > > > > >               case 'b':
> > > > > > > @@ -2916,6 +3013,14 @@ main(int argc, char **argv)
> > > > > > >               case 'g':
> > > > > > >                       filldata = *optarg;
> > > > > > >                       break;
> > > > > > > +             case 'h':
> > > > > > > +                     #ifndef MADV_COLLAPSE
> > > > > >
> > > > > > Preprocessor directives should start at column 0, like most of the rest
> > > > > > of fstests.
> > > > > >
> > > > > > --D
> > > > > >
> > > >
> > >
> >
>
Joanne Koong Jan. 17, 2025, 9:48 p.m. UTC | #11
On Thu, Jan 16, 2025 at 5:57 PM Darrick J. Wong <djwong@kernel.org> wrote:
>
> On Thu, Jan 16, 2025 at 05:03:41PM -0800, Joanne Koong wrote:
> > On Wed, Jan 15, 2025 at 4:59 PM Darrick J. Wong <djwong@kernel.org> wrote:
> > >
> > > On Wed, Jan 15, 2025 at 04:47:30PM -0800, Joanne Koong wrote:
> > > > On Wed, Jan 15, 2025 at 1:37 PM Darrick J. Wong <djwong@kernel.org> wrote:
> > > > >
> > > > > On Wed, Jan 15, 2025 at 10:31:06AM -0800, Joanne Koong wrote:
> > > > > > Add support for reads/writes from buffers backed by hugepages.
> > > > > > This can be enabled through the '-h' flag. This flag should only be used
> > > > > > on systems where THP capabilities are enabled.
> > > > > >
> > > > > > This is motivated by a recent bug that was due to faulty handling of
> > > > > > userspace buffers backed by hugepages. This patch is a mitigation
> > > > > > against problems like this in the future.
> > > > > >
> > > > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > > > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > > > > > ---
> > > > > >  ltp/fsx.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > > > > >  1 file changed, 108 insertions(+), 11 deletions(-)
> > > > > >
> > > > > > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > > > > > index 41933354..8d3a2e2c 100644
> > > > > > --- a/ltp/fsx.c
> > > > > > +++ b/ltp/fsx.c
> > > > > > @@ -190,6 +190,7 @@ int       o_direct;                       /* -Z */
> > > > > >  int  aio = 0;
> > > > > >  int  uring = 0;
> > > > > >  int  mark_nr = 0;
> > > > > > +int  hugepages = 0;                  /* -h flag */
> > > > > >
> > > > > >  int page_size;
> > > > > >  int page_mask;
> > > > > > @@ -2471,7 +2472,7 @@ void
> > > > > >  usage(void)
> > > > > >  {
> > > > > >       fprintf(stdout, "usage: %s",
> > > > > > -             "fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\
> > > > > > +             "fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\
> > > > > >          [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\
> > > > > >          [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\
> > > > > >          [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\
> > > > > > @@ -2484,6 +2485,7 @@ usage(void)
> > > > > >       -e: pollute post-eof on size changes (default 0)\n\
> > > > > >       -f: flush and invalidate cache after I/O\n\
> > > > > >       -g X: write character X instead of random generated data\n\
> > > > > > +     -h hugepages: use buffers backed by hugepages for reads/writes\n\
> > > > >
> > > > > If this requires MADV_COLLAPSE, then perhaps the help text shouldn't
> > > > > describe the switch if the support wasn't compiled in?
> > > > >
> > > > > e.g.
> > > > >
> > > > >         -g X: write character X instead of random generated data\n"
> > > > > #ifdef MADV_COLLAPSE
> > > > > "       -h hugepages: use buffers backed by hugepages for reads/writes\n"
> > > > > #endif
> > > > > "       -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > > > >
> > > > > (assuming I got the preprocessor and string construction goo right; I
> > > > > might be a few cards short of a deck due to zombie attack earlier)
> > > >
> > > > Sounds great, I'll #ifdef out the help text -h line. Hope you feel better.
> > > > >
> > > > > >       -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > > > > >       -j logid: prefix debug log messsages with this id\n\
> > > > > >       -k: do not truncate existing file and use its size as upper bound on file size\n\
> > > > [...]
> > > > > > +}
> > > > > > +
> > > > > > +#ifdef MADV_COLLAPSE
> > > > > > +static void *
> > > > > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> > > > > > +{
> > > > > > +     void *buf;
> > > > > > +     long buf_size = roundup(len, hugepage_size) + alignment;
> > > > > > +
> > > > > > +     if (posix_memalign(&buf, hugepage_size, buf_size)) {
> > > > > > +             prterr("posix_memalign for buf");
> > > > > > +             return NULL;
> > > > > > +     }
> > > > > > +     memset(buf, '\0', buf_size);
> > > > > > +     if (madvise(buf, buf_size, MADV_COLLAPSE)) {
> > > > >
> > > > > If the fsx runs for a long period of time, will it be necessary to call
> > > > > MADV_COLLAPSE periodically to ensure that reclaim doesn't break up the
> > > > > hugepage?
> > > > >
> > > >
> > > > imo, I don't think so. My understanding is that this would be a rare
> > > > edge case that happens when the system is constrained on memory, in
> > > > which case subsequent calls to MADV_COLLAPSE would most likely fail
> > > > anyways.
> > >
> > > Hrmmm... well I /do/ like to run memory constrained VMs to prod reclaim
> > > into stressing the filesystem more.  But I guess there's no good way for
> > > fsx to know that something happened to it.  Unless there's some even
> > > goofier way to force a hugepage, like shmem/hugetlbfs (ugh!) :)
> >
> > I can't think of a better way to force a hugepage either. I believe
> > shmem and hugetlbfs would both require root privileges to do so, and
> > if i'm not mistaken, shmem hugepages are still subject to being broken
> > up by reclaim.
> >
> > >
> > > Will have to ponder hugepage renewasl -- maybe we should madvise every
> > > few thousand fsxops just to be careful?
> >
> > I can add this in, but on memory constrained VMs, would this be
> > effective? To me, it seems like in the majority of cases, subsequent
>
> They're not /so/ memory constrained that the initial collapsed page is
> likely to get split/reclaimed before your regression test finishes...
>
> > attempts at collapsing the broken pages back into a hugepage would
> > fail due to memory still being constrained. In which case, I guess
> > we'd exit the test altogether?
>
> ...but I was starting to wonder if this is something we'd actually want
> in a long soak test, just in case there are weird effects on the system
> after fsx has been running for a few days?
>
> >                                It kind of seems to me like if the user
> > wants to test out hugepages functionality of their filesystem, then
> > the onus is on them to run the test in an environment that can
> > adequately and consistently support hugepages.
>
> I guess the hard part about analyzing that is that long soak tests
> aren't usually supposed to die on account of memory fragmentation.
> Hence me wondering if there's an "easy" way to get one huge page and not
> let it go.

For a long soak situation, I think the periodic collapses every few
thousands ops makes sense. Where the "-h" flag is interpreted as
best-effort (eg the test still continues on even if the subsequent
collapses fail instead of exiting the test altogether, we just make a
best effort at collapsing). I'll add this change to v4.


Thanks,
Joanne

>
> Anyway don't let my blathering hold this up; I think once you fix the
> help text and the #ifdef indenting around exit(86) this patch is good to
> go.
>
> --D
>
> >
> > Thanks,
> > Joanne
> >
> > >
> > > --D
> > >
> > > >
> > > > Thanks,
> > > > Joanne
> > > >
> > > > > > +             prterr("madvise collapse for buf");
> > > > > > +             free(buf);
> > > > > > +             return NULL;
> > > > > > +     }
> > > > > > +
> > > > > > +     return buf;
> > > > > > +}
> > > > > > +#else
> > > > > > +static void *
> > > > > > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
> > > > > > +{
> > > > > > +     return NULL;
> > > > > > +}
> > > > > > +#endif
> > > > > > +
> > > > > > +static void
> > > > > > +init_buffers(void)
> > > > > > +{
> > > > > > +     int i;
> > > > > > +
> > > > > > +     original_buf = (char *) malloc(maxfilelen);
> > > > > > +     for (i = 0; i < maxfilelen; i++)
> > > > > > +             original_buf[i] = random() % 256;
> > > > > > +     if (hugepages) {
> > > > > > +             long hugepage_size = get_hugepage_size();
> > > > > > +             if (hugepage_size == -1) {
> > > > > > +                     prterr("get_hugepage_size()");
> > > > > > +                     exit(102);
> > > > > > +             }
> > > > > > +             good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy);
> > > > > > +             if (!good_buf) {
> > > > > > +                     prterr("init_hugepages_buf failed for good_buf");
> > > > > > +                     exit(103);
> > > > > > +             }
> > > > > > +
> > > > > > +             temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy);
> > > > > > +             if (!temp_buf) {
> > > > > > +                     prterr("init_hugepages_buf failed for temp_buf");
> > > > > > +                     exit(103);
> > > > > > +             }
> > > > > > +     } else {
> > > > > > +             unsigned long good_buf_len = maxfilelen + writebdy;
> > > > > > +             unsigned long temp_buf_len = maxoplen + readbdy;
> > > > > > +
> > > > > > +             good_buf = calloc(1, good_buf_len);
> > > > > > +             temp_buf = calloc(1, temp_buf_len);
> > > > > > +     }
> > > > > > +     good_buf = round_ptr_up(good_buf, writebdy, 0);
> > > > > > +     temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> > > > > > +}
> > > > > > +
> > > > > >  static struct option longopts[] = {
> > > > > >       {"replay-ops", required_argument, 0, 256},
> > > > > >       {"record-ops", optional_argument, 0, 255},
> > > > > > @@ -2883,7 +2980,7 @@ main(int argc, char **argv)
> > > > > >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> > > > > >
> > > > > >       while ((ch = getopt_long(argc, argv,
> > > > > > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > > > > >                                longopts, NULL)) != EOF)
> > > > > >               switch (ch) {
> > > > > >               case 'b':
> > > > > > @@ -2916,6 +3013,14 @@ main(int argc, char **argv)
> > > > > >               case 'g':
> > > > > >                       filldata = *optarg;
> > > > > >                       break;
> > > > > > +             case 'h':
> > > > > > +                     #ifndef MADV_COLLAPSE
> > > > >
> > > > > Preprocessor directives should start at column 0, like most of the rest
> > > > > of fstests.
> > > > >
> > > > > --D
> > > > >
diff mbox series

Patch

diff --git a/ltp/fsx.c b/ltp/fsx.c
index 41933354..8d3a2e2c 100644
--- a/ltp/fsx.c
+++ b/ltp/fsx.c
@@ -190,6 +190,7 @@  int	o_direct;			/* -Z */
 int	aio = 0;
 int	uring = 0;
 int	mark_nr = 0;
+int	hugepages = 0;                  /* -h flag */
 
 int page_size;
 int page_mask;
@@ -2471,7 +2472,7 @@  void
 usage(void)
 {
 	fprintf(stdout, "usage: %s",
-		"fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\
+		"fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\
 	   [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\
 	   [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\
 	   [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\
@@ -2484,6 +2485,7 @@  usage(void)
 	-e: pollute post-eof on size changes (default 0)\n\
 	-f: flush and invalidate cache after I/O\n\
 	-g X: write character X instead of random generated data\n\
+	-h: use buffers backed by hugepages for reads/writes\n\
 	-i logdev: do integrity testing, logdev is the dm log writes device\n\
 	-j logid: prefix debug log messsages with this id\n\
 	-k: do not truncate existing file and use its size as upper bound on file size\n\
@@ -2856,6 +2858,101 @@  keep_running(void)
 	return numops-- != 0;
 }
 
+/*
+ * Return the system default hugepage size in bytes, parsed from the
+ * "Hugepagesize:" line of /proc/meminfo (the value there is in KiB),
+ * or -1 if /proc/meminfo cannot be opened or no nonzero value is found.
+ */
+static long
+get_hugepage_size(void)
+{
+	const char str[] = "Hugepagesize:";
+	size_t str_len =  sizeof(str) - 1;	/* prefix length, NUL excluded */
+	unsigned int hugepage_size = 0;
+	char buffer[64];
+	FILE *file;
+
+	file = fopen("/proc/meminfo", "r");
+	if (!file) {
+		prterr("get_hugepage_size: fopen /proc/meminfo");
+		return -1;
+	}
+	while (fgets(buffer, sizeof(buffer), file)) {
+		if (strncmp(buffer, str, str_len) == 0) {
+			sscanf(buffer + str_len, "%u", &hugepage_size);
+			break;
+		}
+	}
+	fclose(file);
+	if (!hugepage_size) {
+		prterr("get_hugepage_size: failed to find "
+			"hugepage size in /proc/meminfo\n");
+		return -1;
+	}
+
+	/* convert from KiB to bytes */
+	return hugepage_size << 10;
+}
+
+#ifdef MADV_COLLAPSE
+/*
+ * Allocate a zero-filled buffer covering len bytes rounded up to whole
+ * hugepages, plus 'alignment' slack so the caller can round the pointer
+ * up to an I/O boundary, and ask the kernel to back it with hugepages
+ * via MADV_COLLAPSE.  Returns NULL on allocation or collapse failure.
+ */
+static void *
+init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
+{
+	void *buf;
+	long buf_size = roundup(len, hugepage_size) + alignment;
+
+	/* hugepage-aligned start keeps the range collapse-eligible */
+	if (posix_memalign(&buf, hugepage_size, buf_size)) {
+		prterr("posix_memalign for buf");
+		return NULL;
+	}
+	/* zero-fill; this also populates the range before the collapse attempt */
+	memset(buf, '\0', buf_size);
+	if (madvise(buf, buf_size, MADV_COLLAPSE)) {
+		prterr("madvise collapse for buf");
+		free(buf);
+		return NULL;
+	}
+
+	return buf;
+}
+#else
+/* MADV_COLLAPSE unavailable: hugepage-backed buffers are unsupported. */
+static void *
+init_hugepages_buf(unsigned len, int hugepage_size, int alignment)
+{
+	return NULL;
+}
+#endif
+
+/*
+ * Set up the global I/O buffers: original_buf holds random reference
+ * data; good_buf and temp_buf are zero-filled and either hugepage-backed
+ * (-h) or plain calloc memory, then rounded up to the write/read
+ * alignment boundaries (-w/-r).
+ */
+static void
+init_buffers(void)
+{
+	int i;
+
+	original_buf = (char *) malloc(maxfilelen);
+	for (i = 0; i < maxfilelen; i++)
+		original_buf[i] = random() % 256;
+	if (hugepages) {
+		long hugepage_size = get_hugepage_size();
+		if (hugepage_size == -1) {
+			prterr("get_hugepage_size()");
+			exit(102);
+		}
+		good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy);
+		if (!good_buf) {
+			prterr("init_hugepages_buf failed for good_buf");
+			exit(103);
+		}
+
+		temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy);
+		if (!temp_buf) {
+			prterr("init_hugepages_buf failed for temp_buf");
+			exit(103);
+		}
+	} else {
+		/* extra bytes allow rounding the pointer up to the boundary */
+		unsigned long good_buf_len = maxfilelen + writebdy;
+		unsigned long temp_buf_len = maxoplen + readbdy;
+
+		good_buf = calloc(1, good_buf_len);
+		temp_buf = calloc(1, temp_buf_len);
+	}
+	good_buf = round_ptr_up(good_buf, writebdy, 0);
+	temp_buf = round_ptr_up(temp_buf, readbdy, 0);
+}
+
 static struct option longopts[] = {
 	{"replay-ops", required_argument, 0, 256},
 	{"record-ops", optional_argument, 0, 255},
@@ -2883,7 +2980,7 @@  main(int argc, char **argv)
 	setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
 
 	while ((ch = getopt_long(argc, argv,
-				 "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
+				 "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
 				 longopts, NULL)) != EOF)
 		switch (ch) {
 		case 'b':
@@ -2916,6 +3013,14 @@  main(int argc, char **argv)
 		case 'g':
 			filldata = *optarg;
 			break;
+		case 'h':	/* hugepage-backed buffers; requires MADV_COLLAPSE */
+#ifndef MADV_COLLAPSE
+			fprintf(stderr, "MADV_COLLAPSE not supported. "
+				"Can't support -h\n");
+			exit(86);
+#endif
+			hugepages = 1;
+			break;
 		case 'i':
 			integrity = 1;
 			logdev = strdup(optarg);
@@ -3229,15 +3334,7 @@  main(int argc, char **argv)
 			exit(95);
 		}
 	}
-	original_buf = (char *) malloc(maxfilelen);
-	for (i = 0; i < maxfilelen; i++)
-		original_buf[i] = random() % 256;
-	good_buf = (char *) malloc(maxfilelen + writebdy);
-	good_buf = round_ptr_up(good_buf, writebdy, 0);
-	memset(good_buf, '\0', maxfilelen);
-	temp_buf = (char *) malloc(maxoplen + readbdy);
-	temp_buf = round_ptr_up(temp_buf, readbdy, 0);
-	memset(temp_buf, '\0', maxoplen);
+	init_buffers();
 	if (lite) {	/* zero entire existing file */
 		ssize_t written;