diff mbox series

[6/7] fsx: support FIEXCHANGE_RANGE

Message ID 167243878899.732172.1539601356241657286.stgit@magnolia (mailing list archive)
State New, archived
Headers show
Series fstests: atomic file updates | expand

Commit Message

Darrick J. Wong Dec. 30, 2022, 10:19 p.m. UTC
From: Darrick J. Wong <djwong@kernel.org>

Upgrade fsx to support exchanging file contents.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 configure.ac          |    1 
 include/builddefs.in  |    1 
 ltp/Makefile          |    4 +
 ltp/fsx.c             |  160 ++++++++++++++++++++++++++++++++++++++++++++++++-
 m4/package_libcdev.m4 |   21 ++++++
 src/fiexchange.h      |  101 +++++++++++++++++++++++++++++++
 src/global.h          |    6 ++
 7 files changed, 292 insertions(+), 2 deletions(-)
 create mode 100644 src/fiexchange.h

Comments

Zorro Lang Feb. 28, 2023, 1:55 a.m. UTC | #1
On Fri, Dec 30, 2022 at 02:19:49PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> Upgrade fsx to support exchanging file contents.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---

Hi Darrick,

I've merged most of patches of [NYE DELUGE 2/4], now I'm trying to merge
the rest of it this time.

This patch will get build warning [1] from autoconf, can you rebase this patch
to current for-next branch, and use autoupdate to update the configure.ac
and lib/autoconf/general.m4 ?

Thanks,
Zorro

[1]
autoconf
configure.ac:73: warning: The macro `AC_TRY_LINK' is obsolete.
configure.ac:73: You should run autoupdate.
./lib/autoconf/general.m4:2920: AC_TRY_LINK is expanded from...
m4/package_libcdev.m4:161: AC_HAVE_FIEXCHANGE is expanded from...
configure.ac:73: the top level
./configure \   
                --libexecdir=/usr/lib \
                --exec_prefix=/var/lib

>  configure.ac          |    1 
>  include/builddefs.in  |    1 
>  ltp/Makefile          |    4 +
>  ltp/fsx.c             |  160 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  m4/package_libcdev.m4 |   21 ++++++
>  src/fiexchange.h      |  101 +++++++++++++++++++++++++++++++
>  src/global.h          |    6 ++
>  7 files changed, 292 insertions(+), 2 deletions(-)
>  create mode 100644 src/fiexchange.h
> 
> 
> diff --git a/configure.ac b/configure.ac
> index e92bd6b26d..4687d8a3c0 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -70,6 +70,7 @@ AC_HAVE_SEEK_DATA
>  AC_HAVE_BMV_OF_SHARED
>  AC_HAVE_NFTW
>  AC_HAVE_RLIMIT_NOFILE
> +AC_HAVE_FIEXCHANGE

[snip]

>  	setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
>  
>  	while ((ch = getopt_long(argc, argv,
> -				 "b:c:dfg:i:j:kl:m:no:p:qr:s:t:w:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> +				 "0b:c:dfg:i:j:kl:m:no:p:qr:s:t:w:xyABD:EFJKHzCILN:OP:RS:UWXZ",

Looks like we nearly used up most of letters for fsx, to avoid some operations.

Maybe we can use a single option (e.g. -a means avoid) and suboptions to
help that. For example "-a xchg_range,clone_range,dedupe_range" to avoid
these 3 operations. Or use long option, e.g. --no-xchg-range, --no-clone-range
to replace short ones.

What do you think? (Anyway, that's not the problem of this patch)

Thanks,
Zorro

>  				 longopts, NULL)) != EOF)
>  		switch (ch) {
>  		case 'b':
> @@ -2747,6 +2898,9 @@ main(int argc, char **argv)
>  		case 'I':
>  			insert_range_calls = 0;
>  			break;
> +		case '0':
> +			xchg_range_calls = 0;
> +			break;
>  		case 'J':
>  			clone_range_calls = 0;
>  			break;
> @@ -2988,6 +3142,8 @@ main(int argc, char **argv)
>  		dedupe_range_calls = test_dedupe_range();
>  	if (copy_range_calls)
>  		copy_range_calls = test_copy_range();
> +	if (xchg_range_calls)
> +		xchg_range_calls = test_xchg_range();
>  
>  	while (numops == -1 || numops--)
>  		if (!test())
> diff --git a/m4/package_libcdev.m4 b/m4/package_libcdev.m4
> index e1b381c16f..db663970c2 100644
> --- a/m4/package_libcdev.m4
> +++ b/m4/package_libcdev.m4
> @@ -157,3 +157,24 @@ AC_DEFUN([AC_HAVE_RLIMIT_NOFILE],
>         AC_MSG_RESULT(no))
>      AC_SUBST(have_rlimit_nofile)
>    ])
> +
> +#
> +# Check if we have a FIEXCHANGE_RANGE ioctl (Linux)
> +#
> +AC_DEFUN([AC_HAVE_FIEXCHANGE],
> +  [ AC_MSG_CHECKING([for FIEXCHANGE_RANGE])
> +    AC_TRY_LINK([
> +#define _GNU_SOURCE
> +#include <sys/syscall.h>
> +#include <sys/ioctl.h>
> +#include <unistd.h>
> +#include <linux/fs.h>
> +#include <linux/fiexchange.h>
> +    ], [
> +         struct file_xchg_range fxr;
> +         ioctl(-1, FIEXCHANGE_RANGE, &fxr);
> +    ], have_fiexchange=yes
> +       AC_MSG_RESULT(yes),
> +       AC_MSG_RESULT(no))
> +    AC_SUBST(have_fiexchange)
> +  ])
> diff --git a/src/fiexchange.h b/src/fiexchange.h
> new file mode 100644
> index 0000000000..29b3ac0ff5
> --- /dev/null
> +++ b/src/fiexchange.h
> @@ -0,0 +1,101 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later WITH Linux-syscall-note */
> +/*
> + * FIEXCHANGE ioctl definitions, to facilitate exchanging parts of files.
> + *
> + * Copyright (C) 2022 Oracle.  All Rights Reserved.
> + *
> + * Author: Darrick J. Wong <djwong@kernel.org>
> + */
> +#ifndef _LINUX_FIEXCHANGE_H
> +#define _LINUX_FIEXCHANGE_H
> +
> +#include <linux/types.h>
> +
> +/*
> + * Exchange part of file1 with part of the file that this ioctl that is being
> + * called against (which we'll call file2).  Filesystems must be able to
> + * restart and complete the operation even after the system goes down.
> + */
> +struct file_xchg_range {
> +	__s64		file1_fd;
> +	__s64		file1_offset;	/* file1 offset, bytes */
> +	__s64		file2_offset;	/* file2 offset, bytes */
> +	__s64		length;		/* bytes to exchange */
> +
> +	__u64		flags;		/* see FILE_XCHG_RANGE_* below */
> +
> +	/* file2 metadata for optional freshness checks */
> +	__s64		file2_ino;	/* inode number */
> +	__s64		file2_mtime;	/* modification time */
> +	__s64		file2_ctime;	/* change time */
> +	__s32		file2_mtime_nsec; /* mod time, nsec */
> +	__s32		file2_ctime_nsec; /* change time, nsec */
> +
> +	__u64		pad[6];		/* must be zeroes */
> +};
> +
> +/*
> + * Atomic exchange operations are not required.  This relaxes the requirement
> + * that the filesystem must be able to complete the operation after a crash.
> + */
> +#define FILE_XCHG_RANGE_NONATOMIC	(1 << 0)
> +
> +/*
> + * Check that file2's inode number, mtime, and ctime against the values
> + * provided, and return -EBUSY if there isn't an exact match.
> + */
> +#define FILE_XCHG_RANGE_FILE2_FRESH	(1 << 1)
> +
> +/*
> + * Check that the file1's length is equal to file1_offset + length, and that
> + * file2's length is equal to file2_offset + length.  Returns -EDOM if there
> + * isn't an exact match.
> + */
> +#define FILE_XCHG_RANGE_FULL_FILES	(1 << 2)
> +
> +/*
> + * Exchange file data all the way to the ends of both files, and then exchange
> + * the file sizes.  This flag can be used to replace a file's contents with a
> + * different amount of data.  length will be ignored.
> + */
> +#define FILE_XCHG_RANGE_TO_EOF		(1 << 3)
> +
> +/* Flush all changes in file data and file metadata to disk before returning. */
> +#define FILE_XCHG_RANGE_FSYNC		(1 << 4)
> +
> +/* Dry run; do all the parameter verification but do not change anything. */
> +#define FILE_XCHG_RANGE_DRY_RUN		(1 << 5)
> +
> +/*
> + * Do not exchange any part of the range where file1's mapping is a hole.  This
> + * can be used to emulate scatter-gather atomic writes with a temp file.
> + */
> +#define FILE_XCHG_RANGE_SKIP_FILE1_HOLES (1 << 6)
> +
> +/*
> + * Commit the contents of file1 into file2 if file2 has the same inode number,
> + * mtime, and ctime as the arguments provided to the call.  The old contents of
> + * file2 will be moved to file1.
> + *
> + * With this flag, all committed information can be retrieved even if the
> + * system crashes or is rebooted.  This includes writing through or flushing a
> + * disk cache if present.  The call blocks until the device reports that the
> + * commit is complete.
> + *
> + * This flag should not be combined with NONATOMIC.  It can be combined with
> + * SKIP_FILE1_HOLES.
> + */
> +#define FILE_XCHG_RANGE_COMMIT		(FILE_XCHG_RANGE_FILE2_FRESH | \
> +					 FILE_XCHG_RANGE_FSYNC)
> +
> +#define FILE_XCHG_RANGE_ALL_FLAGS	(FILE_XCHG_RANGE_NONATOMIC | \
> +					 FILE_XCHG_RANGE_FILE2_FRESH | \
> +					 FILE_XCHG_RANGE_FULL_FILES | \
> +					 FILE_XCHG_RANGE_TO_EOF | \
> +					 FILE_XCHG_RANGE_FSYNC | \
> +					 FILE_XCHG_RANGE_DRY_RUN | \
> +					 FILE_XCHG_RANGE_SKIP_FILE1_HOLES)
> +
> +#define FIEXCHANGE_RANGE	_IOWR('X', 129, struct file_xchg_range)
> +
> +#endif /* _LINUX_FIEXCHANGE_H */
> diff --git a/src/global.h b/src/global.h
> index b44070993c..49570ef117 100644
> --- a/src/global.h
> +++ b/src/global.h
> @@ -171,6 +171,12 @@
>  #include <sys/mman.h>
>  #endif
>  
> +#ifdef HAVE_FIEXCHANGE
> +# include <linux/fiexchange.h>
> +#else
> +# include "fiexchange.h"
> +#endif
> +
>  static inline unsigned long long
>  rounddown_64(unsigned long long x, unsigned int y)
>  {
>
Darrick J. Wong March 1, 2023, 2:56 a.m. UTC | #2
On Tue, Feb 28, 2023 at 09:55:28AM +0800, Zorro Lang wrote:
> On Fri, Dec 30, 2022 at 02:19:49PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > Upgrade fsx to support exchanging file contents.
> > 
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > ---
> 
> Hi Darrick,
> 
> I've merged most of patches of [NYE DELUGE 2/4], now I'm trying to merge
> the rest of it this time.
> 
> This patch will get build warning [1] from autoconf, can you rebase this patch
> to current for-next branch, and use autoupdate to update the configure.ac
> and lib/autoconf/general.m4 ?

Will do.  Thanks for merging this all, I really appreciate it!

--D

> Thanks,
> Zorro
> 
> [1]
> autoconf
> configure.ac:73: warning: The macro `AC_TRY_LINK' is obsolete.
> configure.ac:73: You should run autoupdate.
> ./lib/autoconf/general.m4:2920: AC_TRY_LINK is expanded from...
> m4/package_libcdev.m4:161: AC_HAVE_FIEXCHANGE is expanded from...
> configure.ac:73: the top level
> ./configure \   
>                 --libexecdir=/usr/lib \
>                 --exec_prefix=/var/lib
> 
> >  configure.ac          |    1 
> >  include/builddefs.in  |    1 
> >  ltp/Makefile          |    4 +
> >  ltp/fsx.c             |  160 ++++++++++++++++++++++++++++++++++++++++++++++++-
> >  m4/package_libcdev.m4 |   21 ++++++
> >  src/fiexchange.h      |  101 +++++++++++++++++++++++++++++++
> >  src/global.h          |    6 ++
> >  7 files changed, 292 insertions(+), 2 deletions(-)
> >  create mode 100644 src/fiexchange.h
> > 
> > 
> > diff --git a/configure.ac b/configure.ac
> > index e92bd6b26d..4687d8a3c0 100644
> > --- a/configure.ac
> > +++ b/configure.ac
> > @@ -70,6 +70,7 @@ AC_HAVE_SEEK_DATA
> >  AC_HAVE_BMV_OF_SHARED
> >  AC_HAVE_NFTW
> >  AC_HAVE_RLIMIT_NOFILE
> > +AC_HAVE_FIEXCHANGE
> 
> [snip]
> 
> >  	setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> >  
> >  	while ((ch = getopt_long(argc, argv,
> > -				 "b:c:dfg:i:j:kl:m:no:p:qr:s:t:w:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > +				 "0b:c:dfg:i:j:kl:m:no:p:qr:s:t:w:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> 
> Looks like we nearly used up most of letters for fsx, to avoid some operations.
> 
> Maybe we can use a single option (e.g. -a means avoid) and suboptions to
> help that. For example "-a xchg_range,clone_range,dedupe_range" to avoid
> these 3 operations. Or use long option, e.g. --no-xchg-range, --no-clone-range
> to replace short ones.
> 
> What do you think? (Anyway, that's not the problem of this patch)
> 
> Thanks,
> Zorro
> 
> >  				 longopts, NULL)) != EOF)
> >  		switch (ch) {
> >  		case 'b':
> > @@ -2747,6 +2898,9 @@ main(int argc, char **argv)
> >  		case 'I':
> >  			insert_range_calls = 0;
> >  			break;
> > +		case '0':
> > +			xchg_range_calls = 0;
> > +			break;
> >  		case 'J':
> >  			clone_range_calls = 0;
> >  			break;
> > @@ -2988,6 +3142,8 @@ main(int argc, char **argv)
> >  		dedupe_range_calls = test_dedupe_range();
> >  	if (copy_range_calls)
> >  		copy_range_calls = test_copy_range();
> > +	if (xchg_range_calls)
> > +		xchg_range_calls = test_xchg_range();
> >  
> >  	while (numops == -1 || numops--)
> >  		if (!test())
> > diff --git a/m4/package_libcdev.m4 b/m4/package_libcdev.m4
> > index e1b381c16f..db663970c2 100644
> > --- a/m4/package_libcdev.m4
> > +++ b/m4/package_libcdev.m4
> > @@ -157,3 +157,24 @@ AC_DEFUN([AC_HAVE_RLIMIT_NOFILE],
> >         AC_MSG_RESULT(no))
> >      AC_SUBST(have_rlimit_nofile)
> >    ])
> > +
> > +#
> > +# Check if we have a FIEXCHANGE_RANGE ioctl (Linux)
> > +#
> > +AC_DEFUN([AC_HAVE_FIEXCHANGE],
> > +  [ AC_MSG_CHECKING([for FIEXCHANGE_RANGE])
> > +    AC_TRY_LINK([
> > +#define _GNU_SOURCE
> > +#include <sys/syscall.h>
> > +#include <sys/ioctl.h>
> > +#include <unistd.h>
> > +#include <linux/fs.h>
> > +#include <linux/fiexchange.h>
> > +    ], [
> > +         struct file_xchg_range fxr;
> > +         ioctl(-1, FIEXCHANGE_RANGE, &fxr);
> > +    ], have_fiexchange=yes
> > +       AC_MSG_RESULT(yes),
> > +       AC_MSG_RESULT(no))
> > +    AC_SUBST(have_fiexchange)
> > +  ])
> > diff --git a/src/fiexchange.h b/src/fiexchange.h
> > new file mode 100644
> > index 0000000000..29b3ac0ff5
> > --- /dev/null
> > +++ b/src/fiexchange.h
> > @@ -0,0 +1,101 @@
> > +/* SPDX-License-Identifier: GPL-2.0-or-later WITH Linux-syscall-note */
> > +/*
> > + * FIEXCHANGE ioctl definitions, to facilitate exchanging parts of files.
> > + *
> > + * Copyright (C) 2022 Oracle.  All Rights Reserved.
> > + *
> > + * Author: Darrick J. Wong <djwong@kernel.org>
> > + */
> > +#ifndef _LINUX_FIEXCHANGE_H
> > +#define _LINUX_FIEXCHANGE_H
> > +
> > +#include <linux/types.h>
> > +
> > +/*
> > + * Exchange part of file1 with part of the file that this ioctl that is being
> > + * called against (which we'll call file2).  Filesystems must be able to
> > + * restart and complete the operation even after the system goes down.
> > + */
> > +struct file_xchg_range {
> > +	__s64		file1_fd;
> > +	__s64		file1_offset;	/* file1 offset, bytes */
> > +	__s64		file2_offset;	/* file2 offset, bytes */
> > +	__s64		length;		/* bytes to exchange */
> > +
> > +	__u64		flags;		/* see FILE_XCHG_RANGE_* below */
> > +
> > +	/* file2 metadata for optional freshness checks */
> > +	__s64		file2_ino;	/* inode number */
> > +	__s64		file2_mtime;	/* modification time */
> > +	__s64		file2_ctime;	/* change time */
> > +	__s32		file2_mtime_nsec; /* mod time, nsec */
> > +	__s32		file2_ctime_nsec; /* change time, nsec */
> > +
> > +	__u64		pad[6];		/* must be zeroes */
> > +};
> > +
> > +/*
> > + * Atomic exchange operations are not required.  This relaxes the requirement
> > + * that the filesystem must be able to complete the operation after a crash.
> > + */
> > +#define FILE_XCHG_RANGE_NONATOMIC	(1 << 0)
> > +
> > +/*
> > + * Check that file2's inode number, mtime, and ctime against the values
> > + * provided, and return -EBUSY if there isn't an exact match.
> > + */
> > +#define FILE_XCHG_RANGE_FILE2_FRESH	(1 << 1)
> > +
> > +/*
> > + * Check that the file1's length is equal to file1_offset + length, and that
> > + * file2's length is equal to file2_offset + length.  Returns -EDOM if there
> > + * isn't an exact match.
> > + */
> > +#define FILE_XCHG_RANGE_FULL_FILES	(1 << 2)
> > +
> > +/*
> > + * Exchange file data all the way to the ends of both files, and then exchange
> > + * the file sizes.  This flag can be used to replace a file's contents with a
> > + * different amount of data.  length will be ignored.
> > + */
> > +#define FILE_XCHG_RANGE_TO_EOF		(1 << 3)
> > +
> > +/* Flush all changes in file data and file metadata to disk before returning. */
> > +#define FILE_XCHG_RANGE_FSYNC		(1 << 4)
> > +
> > +/* Dry run; do all the parameter verification but do not change anything. */
> > +#define FILE_XCHG_RANGE_DRY_RUN		(1 << 5)
> > +
> > +/*
> > + * Do not exchange any part of the range where file1's mapping is a hole.  This
> > + * can be used to emulate scatter-gather atomic writes with a temp file.
> > + */
> > +#define FILE_XCHG_RANGE_SKIP_FILE1_HOLES (1 << 6)
> > +
> > +/*
> > + * Commit the contents of file1 into file2 if file2 has the same inode number,
> > + * mtime, and ctime as the arguments provided to the call.  The old contents of
> > + * file2 will be moved to file1.
> > + *
> > + * With this flag, all committed information can be retrieved even if the
> > + * system crashes or is rebooted.  This includes writing through or flushing a
> > + * disk cache if present.  The call blocks until the device reports that the
> > + * commit is complete.
> > + *
> > + * This flag should not be combined with NONATOMIC.  It can be combined with
> > + * SKIP_FILE1_HOLES.
> > + */
> > +#define FILE_XCHG_RANGE_COMMIT		(FILE_XCHG_RANGE_FILE2_FRESH | \
> > +					 FILE_XCHG_RANGE_FSYNC)
> > +
> > +#define FILE_XCHG_RANGE_ALL_FLAGS	(FILE_XCHG_RANGE_NONATOMIC | \
> > +					 FILE_XCHG_RANGE_FILE2_FRESH | \
> > +					 FILE_XCHG_RANGE_FULL_FILES | \
> > +					 FILE_XCHG_RANGE_TO_EOF | \
> > +					 FILE_XCHG_RANGE_FSYNC | \
> > +					 FILE_XCHG_RANGE_DRY_RUN | \
> > +					 FILE_XCHG_RANGE_SKIP_FILE1_HOLES)
> > +
> > +#define FIEXCHANGE_RANGE	_IOWR('X', 129, struct file_xchg_range)
> > +
> > +#endif /* _LINUX_FIEXCHANGE_H */
> > diff --git a/src/global.h b/src/global.h
> > index b44070993c..49570ef117 100644
> > --- a/src/global.h
> > +++ b/src/global.h
> > @@ -171,6 +171,12 @@
> >  #include <sys/mman.h>
> >  #endif
> >  
> > +#ifdef HAVE_FIEXCHANGE
> > +# include <linux/fiexchange.h>
> > +#else
> > +# include "fiexchange.h"
> > +#endif
> > +
> >  static inline unsigned long long
> >  rounddown_64(unsigned long long x, unsigned int y)
> >  {
> > 
>
diff mbox series

Patch

diff --git a/configure.ac b/configure.ac
index e92bd6b26d..4687d8a3c0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -70,6 +70,7 @@  AC_HAVE_SEEK_DATA
 AC_HAVE_BMV_OF_SHARED
 AC_HAVE_NFTW
 AC_HAVE_RLIMIT_NOFILE
+AC_HAVE_FIEXCHANGE
 
 AC_CHECK_FUNCS([renameat2])
 AC_CHECK_FUNCS([reallocarray])
diff --git a/include/builddefs.in b/include/builddefs.in
index dab10c968f..969acf0da2 100644
--- a/include/builddefs.in
+++ b/include/builddefs.in
@@ -72,6 +72,7 @@  HAVE_SEEK_DATA = @have_seek_data@
 HAVE_NFTW = @have_nftw@
 HAVE_BMV_OF_SHARED = @have_bmv_of_shared@
 HAVE_RLIMIT_NOFILE = @have_rlimit_nofile@
+HAVE_FIEXCHANGE = @have_fiexchange@
 
 GCCFLAGS = -funsigned-char -fno-strict-aliasing -Wall
 
diff --git a/ltp/Makefile b/ltp/Makefile
index 85f634145c..c2b70d896e 100644
--- a/ltp/Makefile
+++ b/ltp/Makefile
@@ -36,6 +36,10 @@  ifeq ($(HAVE_COPY_FILE_RANGE),yes)
 LCFLAGS += -DHAVE_COPY_FILE_RANGE
 endif
 
+ifeq ($(HAVE_FIEXCHANGE),yes)
+LCFLAGS += -DHAVE_FIEXCHANGE
+endif
+
 default: depend $(TARGETS)
 
 depend: .dep
diff --git a/ltp/fsx.c b/ltp/fsx.c
index 12c2cc33bf..ee4b8fe45d 100644
--- a/ltp/fsx.c
+++ b/ltp/fsx.c
@@ -111,6 +111,7 @@  enum {
 	OP_CLONE_RANGE,
 	OP_DEDUPE_RANGE,
 	OP_COPY_RANGE,
+	OP_EXCHANGE_RANGE,
 	OP_MAX_FULL,
 
 	/* integrity operations */
@@ -175,6 +176,7 @@  int	check_file = 0;			/* -X flag enables */
 int	clone_range_calls = 1;		/* -J flag disables */
 int	dedupe_range_calls = 1;		/* -B flag disables */
 int	copy_range_calls = 1;		/* -E flag disables */
+int	xchg_range_calls = 1;		/* -0 flag disables */
 int	integrity = 0;			/* -i flag */
 int	fsxgoodfd = 0;
 int	o_direct;			/* -Z */
@@ -268,6 +270,7 @@  static const char *op_names[] = {
 	[OP_DEDUPE_RANGE] = "dedupe_range",
 	[OP_COPY_RANGE] = "copy_range",
 	[OP_FSYNC] = "fsync",
+	[OP_EXCHANGE_RANGE] = "xchg_range",
 };
 
 static const char *op_name(int operation)
@@ -452,6 +455,20 @@  logdump(void)
 			if (overlap)
 				prt("\t******IIII");
 			break;
+		case OP_EXCHANGE_RANGE:
+			prt("XCHG 0x%x thru 0x%x\t(0x%x bytes) to 0x%x thru 0x%x",
+			    lp->args[0], lp->args[0] + lp->args[1] - 1,
+			    lp->args[1],
+			    lp->args[2], lp->args[2] + lp->args[1] - 1);
+			overlap2 = badoff >= lp->args[2] &&
+				  badoff < lp->args[2] + lp->args[1];
+			if (overlap && overlap2)
+				prt("\tXXXX**XXXX");
+			else if (overlap)
+				prt("\tXXXX******");
+			else if (overlap2)
+				prt("\t******XXXX");
+			break;
 		case OP_CLONE_RANGE:
 			prt("CLONE 0x%x thru 0x%x\t(0x%x bytes) to 0x%x thru 0x%x",
 			    lp->args[0], lp->args[0] + lp->args[1] - 1,
@@ -1369,6 +1386,116 @@  do_insert_range(unsigned offset, unsigned length)
 }
 #endif
 
+#ifdef FIEXCHANGE_RANGE
+static __u64 swap_flags = 0;
+
+int
+test_xchg_range(void)
+{
+	struct file_xchg_range	fsr = {
+		.file1_fd = fd,
+		.flags = FILE_XCHG_RANGE_DRY_RUN | swap_flags,
+	};
+	int ret, e;
+
+retry:
+	ret = ioctl(fd, FIEXCHANGE_RANGE, &fsr);
+	e = ret < 0 ? errno : 0;
+	if (e == EOPNOTSUPP && !(swap_flags & FILE_XCHG_RANGE_NONATOMIC)) {
+		/*
+		 * If the call fails with atomic mode, try again with non
+		 * atomic mode.
+		 */
+		swap_flags = FILE_XCHG_RANGE_NONATOMIC;
+		fsr.flags |= swap_flags;
+		goto retry;
+	}
+	if (e == EOPNOTSUPP || errno == ENOTTY) {
+		if (!quiet)
+			fprintf(stderr,
+				"main: filesystem does not support "
+				"exchange range, disabling!\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+void
+do_xchg_range(unsigned offset, unsigned length, unsigned dest)
+{
+	struct file_xchg_range	fsr = {
+		.file1_fd = fd,
+		.file1_offset = offset,
+		.file2_offset = dest,
+		.length = length,
+		.flags = swap_flags,
+	};
+	void *p;
+
+	if (length == 0) {
+		if (!quiet && testcalls > simulatedopcount)
+			prt("skipping zero length exchange range\n");
+		log5(OP_EXCHANGE_RANGE, offset, length, dest, FL_SKIPPED);
+		return;
+	}
+
+	if ((loff_t)offset >= file_size || (loff_t)dest >= file_size) {
+		if (!quiet && testcalls > simulatedopcount)
+			prt("skipping exchange range behind EOF\n");
+		log5(OP_EXCHANGE_RANGE, offset, length, dest, FL_SKIPPED);
+		return;
+	}
+
+	p = malloc(length);
+	if (!p) {
+		if (!quiet && testcalls > simulatedopcount)
+			prt("skipping exchange range due to ENOMEM\n");
+		log5(OP_EXCHANGE_RANGE, offset, length, dest, FL_SKIPPED);
+		return;
+	}
+
+	log5(OP_EXCHANGE_RANGE, offset, length, dest, FL_NONE);
+
+	if (testcalls <= simulatedopcount)
+		goto out_free;
+
+	if ((progressinterval && testcalls % progressinterval == 0) ||
+	    (debug && (monitorstart == -1 || monitorend == -1 ||
+		       dest <= monitorstart || dest + length <= monitorend))) {
+		prt("%lu swap\tfrom 0x%x to 0x%x, (0x%x bytes) at 0x%x\n",
+			testcalls, offset, offset+length, length, dest);
+	}
+
+	if (ioctl(fd, FIEXCHANGE_RANGE, &fsr) == -1) {
+		prt("exchange range: 0x%x to 0x%x at 0x%x\n", offset,
+				offset + length, dest);
+		prterr("do_xchg_range: FIEXCHANGE_RANGE");
+		report_failure(161);
+		goto out_free;
+	}
+
+	memcpy(p, good_buf + offset, length);
+	memcpy(good_buf + offset, good_buf + dest, length);
+	memcpy(good_buf + dest, p, length);
+out_free:
+	free(p);
+}
+
+#else
+int
+test_xchg_range(void)
+{
+	return 0;
+}
+
+void
+do_xchg_range(unsigned offset, unsigned length, unsigned dest)
+{
+	return;
+}
+#endif
+
 #ifdef FICLONERANGE
 int
 test_clone_range(void)
@@ -1856,6 +1983,7 @@  static int
 op_args_count(int operation)
 {
 	switch (operation) {
+	case OP_EXCHANGE_RANGE:
 	case OP_CLONE_RANGE:
 	case OP_DEDUPE_RANGE:
 	case OP_COPY_RANGE:
@@ -2053,6 +2181,9 @@  test(void)
 	case OP_COPY_RANGE:
 		generate_dest_range(true, maxfilelen, &offset, &size, &offset2);
 		break;
+	case OP_EXCHANGE_RANGE:
+		generate_dest_range(false, file_size, &offset, &size, &offset2);
+		break;
 	}
 
 have_op:
@@ -2096,6 +2227,12 @@  test(void)
 			goto out;
 		}
 		break;
+	case OP_EXCHANGE_RANGE:
+		if (!xchg_range_calls) {
+			log5(op, offset, size, offset2, FL_SKIPPED);
+			goto out;
+		}
+		break;
 	case OP_CLONE_RANGE:
 		if (!clone_range_calls) {
 			log5(op, offset, size, offset2, FL_SKIPPED);
@@ -2180,6 +2317,18 @@  test(void)
 
 		do_insert_range(offset, size);
 		break;
+	case OP_EXCHANGE_RANGE:
+		if (size == 0) {
+			log5(OP_EXCHANGE_RANGE, offset, size, offset2, FL_SKIPPED);
+			goto out;
+		}
+		if (offset2 + size > maxfilelen) {
+			log5(OP_EXCHANGE_RANGE, offset, size, offset2, FL_SKIPPED);
+			goto out;
+		}
+
+		do_xchg_range(offset, size, offset2);
+		break;
 	case OP_CLONE_RANGE:
 		if (size == 0) {
 			log5(OP_CLONE_RANGE, offset, size, offset2, FL_SKIPPED);
@@ -2294,6 +2443,9 @@  usage(void)
 #ifdef HAVE_COPY_FILE_RANGE
 "	-E: Do not use copy range calls\n"
 #endif
+#ifdef FIEXCHANGE_RANGE
+"	-0: Do not use exchange range calls\n"
+#endif
 "	-L: fsxLite - no file creations & no file size changes\n\
 	-N numops: total # operations to do (default infinity)\n\
 	-O: use oplen (see -o flag) for every op (default random)\n\
@@ -2608,12 +2760,11 @@  main(int argc, char **argv)
 	page_size = getpagesize();
 	page_mask = page_size - 1;
 	mmap_mask = page_mask;
-	
 
 	setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
 
 	while ((ch = getopt_long(argc, argv,
-				 "b:c:dfg:i:j:kl:m:no:p:qr:s:t:w:xyABD:EFJKHzCILN:OP:RS:UWXZ",
+				 "0b:c:dfg:i:j:kl:m:no:p:qr:s:t:w:xyABD:EFJKHzCILN:OP:RS:UWXZ",
 				 longopts, NULL)) != EOF)
 		switch (ch) {
 		case 'b':
@@ -2747,6 +2898,9 @@  main(int argc, char **argv)
 		case 'I':
 			insert_range_calls = 0;
 			break;
+		case '0':
+			xchg_range_calls = 0;
+			break;
 		case 'J':
 			clone_range_calls = 0;
 			break;
@@ -2988,6 +3142,8 @@  main(int argc, char **argv)
 		dedupe_range_calls = test_dedupe_range();
 	if (copy_range_calls)
 		copy_range_calls = test_copy_range();
+	if (xchg_range_calls)
+		xchg_range_calls = test_xchg_range();
 
 	while (numops == -1 || numops--)
 		if (!test())
diff --git a/m4/package_libcdev.m4 b/m4/package_libcdev.m4
index e1b381c16f..db663970c2 100644
--- a/m4/package_libcdev.m4
+++ b/m4/package_libcdev.m4
@@ -157,3 +157,24 @@  AC_DEFUN([AC_HAVE_RLIMIT_NOFILE],
        AC_MSG_RESULT(no))
     AC_SUBST(have_rlimit_nofile)
   ])
+
+#
+# Check if we have a FIEXCHANGE_RANGE ioctl (Linux)
+#
+AC_DEFUN([AC_HAVE_FIEXCHANGE],
+  [ AC_MSG_CHECKING([for FIEXCHANGE_RANGE])
+    AC_TRY_LINK([
+#define _GNU_SOURCE
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <linux/fs.h>
+#include <linux/fiexchange.h>
+    ], [
+         struct file_xchg_range fxr;
+         ioctl(-1, FIEXCHANGE_RANGE, &fxr);
+    ], have_fiexchange=yes
+       AC_MSG_RESULT(yes),
+       AC_MSG_RESULT(no))
+    AC_SUBST(have_fiexchange)
+  ])
diff --git a/src/fiexchange.h b/src/fiexchange.h
new file mode 100644
index 0000000000..29b3ac0ff5
--- /dev/null
+++ b/src/fiexchange.h
@@ -0,0 +1,101 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later WITH Linux-syscall-note */
+/*
+ * FIEXCHANGE ioctl definitions, to facilitate exchanging parts of files.
+ *
+ * Copyright (C) 2022 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef _LINUX_FIEXCHANGE_H
+#define _LINUX_FIEXCHANGE_H
+
+#include <linux/types.h>
+
+/*
+ * Exchange part of file1 with part of the file that this ioctl that is being
+ * called against (which we'll call file2).  Filesystems must be able to
+ * restart and complete the operation even after the system goes down.
+ */
+struct file_xchg_range {
+	__s64		file1_fd;
+	__s64		file1_offset;	/* file1 offset, bytes */
+	__s64		file2_offset;	/* file2 offset, bytes */
+	__s64		length;		/* bytes to exchange */
+
+	__u64		flags;		/* see FILE_XCHG_RANGE_* below */
+
+	/* file2 metadata for optional freshness checks */
+	__s64		file2_ino;	/* inode number */
+	__s64		file2_mtime;	/* modification time */
+	__s64		file2_ctime;	/* change time */
+	__s32		file2_mtime_nsec; /* mod time, nsec */
+	__s32		file2_ctime_nsec; /* change time, nsec */
+
+	__u64		pad[6];		/* must be zeroes */
+};
+
+/*
+ * Atomic exchange operations are not required.  This relaxes the requirement
+ * that the filesystem must be able to complete the operation after a crash.
+ */
+#define FILE_XCHG_RANGE_NONATOMIC	(1 << 0)
+
+/*
+ * Check that file2's inode number, mtime, and ctime against the values
+ * provided, and return -EBUSY if there isn't an exact match.
+ */
+#define FILE_XCHG_RANGE_FILE2_FRESH	(1 << 1)
+
+/*
+ * Check that the file1's length is equal to file1_offset + length, and that
+ * file2's length is equal to file2_offset + length.  Returns -EDOM if there
+ * isn't an exact match.
+ */
+#define FILE_XCHG_RANGE_FULL_FILES	(1 << 2)
+
+/*
+ * Exchange file data all the way to the ends of both files, and then exchange
+ * the file sizes.  This flag can be used to replace a file's contents with a
+ * different amount of data.  length will be ignored.
+ */
+#define FILE_XCHG_RANGE_TO_EOF		(1 << 3)
+
+/* Flush all changes in file data and file metadata to disk before returning. */
+#define FILE_XCHG_RANGE_FSYNC		(1 << 4)
+
+/* Dry run; do all the parameter verification but do not change anything. */
+#define FILE_XCHG_RANGE_DRY_RUN		(1 << 5)
+
+/*
+ * Do not exchange any part of the range where file1's mapping is a hole.  This
+ * can be used to emulate scatter-gather atomic writes with a temp file.
+ */
+#define FILE_XCHG_RANGE_SKIP_FILE1_HOLES (1 << 6)
+
+/*
+ * Commit the contents of file1 into file2 if file2 has the same inode number,
+ * mtime, and ctime as the arguments provided to the call.  The old contents of
+ * file2 will be moved to file1.
+ *
+ * With this flag, all committed information can be retrieved even if the
+ * system crashes or is rebooted.  This includes writing through or flushing a
+ * disk cache if present.  The call blocks until the device reports that the
+ * commit is complete.
+ *
+ * This flag should not be combined with NONATOMIC.  It can be combined with
+ * SKIP_FILE1_HOLES.
+ */
+#define FILE_XCHG_RANGE_COMMIT		(FILE_XCHG_RANGE_FILE2_FRESH | \
+					 FILE_XCHG_RANGE_FSYNC)
+
+#define FILE_XCHG_RANGE_ALL_FLAGS	(FILE_XCHG_RANGE_NONATOMIC | \
+					 FILE_XCHG_RANGE_FILE2_FRESH | \
+					 FILE_XCHG_RANGE_FULL_FILES | \
+					 FILE_XCHG_RANGE_TO_EOF | \
+					 FILE_XCHG_RANGE_FSYNC | \
+					 FILE_XCHG_RANGE_DRY_RUN | \
+					 FILE_XCHG_RANGE_SKIP_FILE1_HOLES)
+
+#define FIEXCHANGE_RANGE	_IOWR('X', 129, struct file_xchg_range)
+
+#endif /* _LINUX_FIEXCHANGE_H */
diff --git a/src/global.h b/src/global.h
index b44070993c..49570ef117 100644
--- a/src/global.h
+++ b/src/global.h
@@ -171,6 +171,12 @@ 
 #include <sys/mman.h>
 #endif
 
+#ifdef HAVE_FIEXCHANGE
+# include <linux/fiexchange.h>
+#else
+# include "fiexchange.h"
+#endif
+
 static inline unsigned long long
 rounddown_64(unsigned long long x, unsigned int y)
 {