diff mbox series

[02/14] xfs: introduce new file range exchange ioctls

Message ID 170900011673.938268.12940080187778287002.stgit@frogsfrogsfrogs (mailing list archive)
State Superseded
Headers show
Series [01/14] vfs: export remap and write check helpers | expand

Commit Message

Darrick J. Wong Feb. 27, 2024, 2:21 a.m. UTC
From: Darrick J. Wong <djwong@kernel.org>

Introduce a pair of new ioctls to handle exchanging ranges of bytes
between files.  The goal here is to perform the exchange atomically with
respect to applications -- either they see the file contents before the
exchange or they see that A-B is now B-A, even if the kernel crashes.

The simpler of the two ioctls is XFS_IOC_EXCHANGE_RANGE, which performs
the exchange unconditionally.  XFS_IOC_COMMIT_RANGE, on the other hand,
requires the caller to sample the file attributes of one of the files
participating in the exchange, and aborts the exchange if that file has
changed in the meantime (presumably by another thread).

For userspace programs the goal is to enable defragmentation of files
(it's a replacement for the old swapext ioctl), the atomic commit of
sparse updates to an indexed data file, or emulation of atomic writes on
software defined storage servers.

My original goal with all this code was to make it so that online repair
can build a replacement directory or xattr structure in a temporary file
and commit the repair by atomically exchanging all the data blocks
between the two files.  However, I needed a way to test this mechanism
thoroughly, so I've been evolving an ioctl interface since then.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/Makefile        |    1 
 fs/xfs/libxfs/xfs_fs.h |   72 +++++++++++
 fs/xfs/xfs_exchrange.c |  307 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_exchrange.h |   38 ++++++
 fs/xfs/xfs_ioctl.c     |   89 ++++++++++++++
 5 files changed, 507 insertions(+)
 create mode 100644 fs/xfs/xfs_exchrange.c
 create mode 100644 fs/xfs/xfs_exchrange.h

Comments

Christoph Hellwig Feb. 28, 2024, 3:44 p.m. UTC | #1
On Mon, Feb 26, 2024 at 06:21:23PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> Introduce a pair of new ioctls to handle exchanging ranges of bytes
> between files.  The goal here is to perform the exchange atomically with
> respect to applications -- either they see the file contents before the
> exchange or they see that A-B is now B-A, even if the kernel crashes.
> 
> The simpler of the two ioctls is XFS_IOC_EXCHANGE_RANGE, which performs
> the exchange unconditionally.  XFS_IOC_COMMIT_RANGE, on the other hand,
> requires the caller to sample the file attributes of one of the files
> participating in the exchange, and aborts the exchange if that file has
> changed in the meantime (presumably by another thread).

So per all the discussions, wouldn't it make sense to separate out
XFS_IOC_COMMIT_RANGE (plus the new start commit one later), and if
discussions are still going on just get XFS_IOC_EXCHANGE_RANGE
done ASAP to go on with online repair, and give XFS_IOC_COMMIT_RANGE
enough time to discuss the finer details?

> +struct xfs_exch_range {
> +	__s64		file1_fd;

I should have noticed this last time, by why are we passing a fd
as a 64-bit value when it actually just is a 32-bit value in syscalls?
(same for commit).

> +	if (((fxr->file1->f_flags | fxr->file2->f_flags) & (__O_SYNC | O_DSYNC)) ||

Nit: overly long line here.

> +	if (fxr->flags & ~(XFS_EXCHRANGE_ALL_FLAGS | __XFS_EXCHRANGE_CHECK_FRESH2))

.. and here

> +	/*
> +	 * The ioctl enforces that src and dest files are on the same mount.
> +	 * However, they only need to be on the same file system.
> +	 */
> +	if (inode1->i_sb != inode2->i_sb)
> +		return -EXDEV;

How about only doing this checks once further up?  As the same sb also
applies the same mount.
Darrick J. Wong Feb. 28, 2024, 7:35 p.m. UTC | #2
On Wed, Feb 28, 2024 at 07:44:32AM -0800, Christoph Hellwig wrote:
> On Mon, Feb 26, 2024 at 06:21:23PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > Introduce a pair of new ioctls to handle exchanging ranges of bytes
> > between files.  The goal here is to perform the exchange atomically with
> > respect to applications -- either they see the file contents before the
> > exchange or they see that A-B is now B-A, even if the kernel crashes.
> > 
> > The simpler of the two ioctls is XFS_IOC_EXCHANGE_RANGE, which performs
> > the exchange unconditionally.  XFS_IOC_COMMIT_RANGE, on the other hand,
> > requires the caller to sample the file attributes of one of the files
> > participating in the exchange, and aborts the exchange if that file has
> > changed in the meantime (presumably by another thread).
> 
> So per all the discussions, wouldn't it make sense to separate out
> XFS_IOC_COMMIT_RANGE (plus the new start commit one later), and if
> discussions are still going on just get XFS_IOC_EXCHANGE_RANGE
> done ASAP to go on with online repair, and give XFS_IOC_COMMIT_RANGE
> enough time to discuss the finer details?

Done.  All of the COMMIT_RANGE code (and the _CHECK_FRESH2 code) have
been moved to a separate patch and combined with the patch[1] from
yesterday.

[1] https://lore.kernel.org/linux-xfs/CAOQ4uxiPfno-Hx+fH3LEN_4D6HQgyMAySRNCU=O2R_-ksrxSDQ@mail.gmail.com/

> > +struct xfs_exch_range {
> > +	__s64		file1_fd;
> 
> I should have noticed this last time, by why are we passing a fd
> as a 64-bit value when it actually just is a 32-bit value in syscalls?
> (same for commit).

I'll change that.

> > +	if (((fxr->file1->f_flags | fxr->file2->f_flags) & (__O_SYNC | O_DSYNC)) ||
> 
> Nit: overly long line here.

I'll replace the __O_SYNC | O_DSYNC with O_SYNC, since they're the same
thing.

> > +	if (fxr->flags & ~(XFS_EXCHRANGE_ALL_FLAGS | __XFS_EXCHRANGE_CHECK_FRESH2))
> 
> .. and here

Fixed, thanks.

> > +	/*
> > +	 * The ioctl enforces that src and dest files are on the same mount.
> > +	 * However, they only need to be on the same file system.
> > +	 */
> > +	if (inode1->i_sb != inode2->i_sb)
> > +		return -EXDEV;
> 
> How about only doing this checks once further up?  As the same sb also
> applies the same mount.

I'll remove this check entirely, since we've already checked that the
vfsmnt are the same.  Assuming that's what you meant-- I was slightly
confused by "same sb also applies the same mount" and decided to
interpret that as "same sb implies the same mount".

--D
Christoph Hellwig Feb. 28, 2024, 7:37 p.m. UTC | #3
On Wed, Feb 28, 2024 at 11:35:47AM -0800, Darrick J. Wong wrote:
> > How about only doing this checks once further up?  As the same sb also
> > applies the same mount.
> 
> I'll remove this check entirely, since we've already checked that the
> vfsmnt are the same.  Assuming that's what you meant-- I was slightly
> confused by "same sb also applies the same mount" and decided to
> interpret that as "same sb implies the same mount".

You interpreted the correctly.  Sorry for my jetlagged early morning
incoherence.
Darrick J. Wong Feb. 28, 2024, 11 p.m. UTC | #4
On Wed, Feb 28, 2024 at 11:37:26AM -0800, Christoph Hellwig wrote:
> On Wed, Feb 28, 2024 at 11:35:47AM -0800, Darrick J. Wong wrote:
> > > How about only doing this checks once further up?  As the same sb also
> > > applies the same mount.
> > 
> > I'll remove this check entirely, since we've already checked that the
> > vfsmnt are the same.  Assuming that's what you meant-- I was slightly
> > confused by "same sb also applies the same mount" and decided to
> > interpret that as "same sb implies the same mount".
> 
> You interpreted the correctly.  Sorry for my jetlagged early morning
> incoherence.

So it occurs to me that I've mismatched the signedness in struct
xfs_exchange_range:

struct xfs_exchange_range {
	...
	__s64		file1_offset;	/* file1 offset, bytes */
	__s64		file2_offset;	/* file2 offset, bytes */
	__u64		length;		/* bytes to exchange */

Compare this to FICLONERANGE:

struct file_clone_range {
	...
	__u64 src_offset;
	__u64 src_length;
	__u64 dest_offset;
};

The offsets and lengths for FICLONERANGE are unsigned, so I think
xfs_exchange_range ought to follow that.

--D
Christoph Hellwig Feb. 29, 2024, 1:22 p.m. UTC | #5
On Wed, Feb 28, 2024 at 03:00:57PM -0800, Darrick J. Wong wrote:
> The offsets and lengths for FICLONERANGE are unsigned, so I think
> xfs_exchange_range ought to follow that.

Yes.  I though I had actually brought that up before, but I might have
wanted to and not actually sent the comments out..
Darrick J. Wong Feb. 29, 2024, 5:10 p.m. UTC | #6
On Thu, Feb 29, 2024 at 05:22:13AM -0800, Christoph Hellwig wrote:
> On Wed, Feb 28, 2024 at 03:00:57PM -0800, Darrick J. Wong wrote:
> > The offsets and lengths for FICLONERANGE are unsigned, so I think
> > xfs_exchange_range ought to follow that.
> 
> Yes.  I though I had actually brought that up before, but I might have
> wanted to and not actually sent the comments out..

You mentioned it in passing, but I misinterpreted what you'd said and
took the signedness in the wrong direction.  Here's what I went with in
the end:

struct xfs_exchange_range {
	__s32		file1_fd;
	__u32		pad;		/* must be zeroes */
	__u64		file1_offset;	/* file1 offset, bytes */
	__u64		file2_offset;	/* file2 offset, bytes */
	__u64		length;		/* bytes to exchange */

	__u64		flags;		/* see XFS_EXCHANGE_RANGE_* below */
};

--D
Christoph Hellwig Feb. 29, 2024, 7:42 p.m. UTC | #7
On Thu, Feb 29, 2024 at 09:10:43AM -0800, Darrick J. Wong wrote:
> You mentioned it in passing, but I misinterpreted what you'd said and
> took the signedness in the wrong direction.  Here's what I went with in
> the end:

Looks good to me.
diff mbox series

Patch

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 20f0bbe4b7102..3e85762a28ee7 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -67,6 +67,7 @@  xfs-y				+= xfs_aops.o \
 				   xfs_dir2_readdir.o \
 				   xfs_discard.o \
 				   xfs_error.o \
+				   xfs_exchrange.o \
 				   xfs_export.o \
 				   xfs_extent_busy.o \
 				   xfs_file.o \
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index ca1b17d014377..fe2bd607ac11f 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -772,6 +772,76 @@  struct xfs_scrub_metadata {
 #  define XFS_XATTR_LIST_MAX 65536
 #endif
 
+/*
+ * Exchange part of file1 with part of the file that this ioctl that is being
+ * called against (which we'll call file2).  Filesystems must be able to
+ * restart and complete the operation even after the system goes down.
+ */
+struct xfs_exch_range {
+	__s64		file1_fd;
+	__s64		file1_offset;	/* file1 offset, bytes */
+	__s64		file2_offset;	/* file2 offset, bytes */
+	__u64		length;		/* bytes to exchange */
+
+	__u64		flags;		/* see XFS_EXCHRANGE_* below */
+
+	__u64		pad;		/* must be zeroes */
+};
+
+/*
+ * Using the same definition of file2 as struct xfs_exch_range, commit the
+ * contents of file1 into file2 if file2 has the same inode number, mtime, and
+ * ctime as the arguments provided to the call.  The old contents of file2 will
+ * be moved to file1.
+ *
+ * Returns -EBUSY if there isn't an exact match for the file2 fields.
+ *
+ * Filesystems must be able to restart and complete the operation even after
+ * the system goes down.
+ */
+struct xfs_commit_range {
+	__s64		file1_fd;
+	__s64		file1_offset;	/* file1 offset, bytes */
+	__s64		file2_offset;	/* file2 offset, bytes */
+	__s64		length;		/* bytes to exchange */
+
+	__u64		flags;		/* see XFS_EXCHRANGE_* below */
+
+	/* file2 metadata for freshness checks */
+	__u64		file2_ino;	/* inode number */
+	__s64		file2_mtime;	/* modification time */
+	__s64		file2_ctime;	/* change time */
+	__s32		file2_mtime_nsec; /* mod time, nsec */
+	__s32		file2_ctime_nsec; /* change time, nsec */
+
+	__u64		pad;		/* must be zeroes */
+};
+
+/*
+ * Exchange file data all the way to the ends of both files, and then exchange
+ * the file sizes.  This flag can be used to replace a file's contents with a
+ * different amount of data.  length will be ignored.
+ */
+#define XFS_EXCHRANGE_TO_EOF		(1ULL << 0)
+
+/* Flush all changes in file data and file metadata to disk before returning. */
+#define XFS_EXCHRANGE_DSYNC		(1ULL << 1)
+
+/* Dry run; do all the parameter verification but do not change anything. */
+#define XFS_EXCHRANGE_DRY_RUN		(1ULL << 2)
+
+/*
+ * Exchange only the parts of the two files where the file allocation units
+ * mapped to file1's range have been written to.  This can accelerate
+ * scatter-gather atomic writes with a temp file if all writes are aligned to
+ * the file allocation unit.
+ */
+#define XFS_EXCHRANGE_FILE1_WRITTEN	(1ULL << 3)
+
+#define XFS_EXCHRANGE_ALL_FLAGS		(XFS_EXCHRANGE_TO_EOF | \
+					 XFS_EXCHRANGE_DSYNC | \
+					 XFS_EXCHRANGE_DRY_RUN | \
+					 XFS_EXCHRANGE_FILE1_WRITTEN)
 
 /*
  * ioctl commands that are used by Linux filesystems
@@ -843,6 +913,8 @@  struct xfs_scrub_metadata {
 #define XFS_IOC_FSGEOMETRY	     _IOR ('X', 126, struct xfs_fsop_geom)
 #define XFS_IOC_BULKSTAT	     _IOR ('X', 127, struct xfs_bulkstat_req)
 #define XFS_IOC_INUMBERS	     _IOR ('X', 128, struct xfs_inumbers_req)
+#define XFS_IOC_EXCHANGE_RANGE	     _IOWR('X', 129, struct xfs_exch_range)
+#define XFS_IOC_COMMIT_RANGE	     _IOWR('X', 129, struct xfs_commit_range)
 /*	XFS_IOC_GETFSUUID ---------- deprecated 140	 */
 
 
diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c
new file mode 100644
index 0000000000000..d5889db89daeb
--- /dev/null
+++ b/fs/xfs/xfs_exchrange.c
@@ -0,0 +1,307 @@ 
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_exchrange.h"
+#include <linux/fsnotify.h>
+
+/*
+ * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
+ * This part deals with struct file objects and byte ranges and does not deal
+ * with XFS-specific data structures such as xfs_inodes and block ranges.  It
+ * may some day be ported to the VFS.
+ *
+ * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
+ * file1 with the same number of bytes starting at fxr.file2_offset in file2.
+ * Implementations must call xfs_generic_exchrange_prep to prepare the two
+ * files prior to taking locks; and they must update the inode change and mod
+ * times of both files as part of the metadata update.  The timestamp update
+ * and freshness checks must be done atomically as part of the data exchange
+ * operation to ensure correctness of the freshness check.
+ * xfs_generic_exchrange_finish must be called after the operation completes
+ * successfully but before locks are dropped.
+ */
+
+/* Verify that we have security clearance to perform this operation. */
+static int
+xfs_generic_exchrange_verify_area(
+	struct xfs_exchrange	*fxr)
+{
+	int			ret;
+
+	ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length,
+			true);
+	if (ret)
+		return ret;
+
+	return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length,
+			true);
+}
+
+/*
+ * Performs necessary checks before doing a range exchange, having stabilized
+ * mutable inode attributes via i_rwsem.
+ */
+static inline int
+xfs_generic_exchrange_checks(
+	struct xfs_exchrange	*fxr,
+	unsigned int		alloc_unit)
+{
+	struct inode		*inode1 = file_inode(fxr->file1);
+	struct inode		*inode2 = file_inode(fxr->file2);
+	uint64_t		allocmask = alloc_unit - 1;
+	int64_t			test_len;
+	uint64_t		blen;
+	loff_t			size1, size2, tmp;
+	int			error;
+
+	/* Don't touch certain kinds of inodes */
+	if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
+		return -EPERM;
+	if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
+		return -ETXTBSY;
+
+	size1 = i_size_read(inode1);
+	size2 = i_size_read(inode2);
+
+	/* Ranges cannot start after EOF. */
+	if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
+		return -EINVAL;
+
+	/*
+	 * If the caller said to exchange to EOF, we set the length of the
+	 * request large enough to cover everything to the end of both files.
+	 */
+	if (fxr->flags & XFS_EXCHRANGE_TO_EOF) {
+		fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
+					     size2 - fxr->file2_offset);
+
+		error = xfs_generic_exchrange_verify_area(fxr);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * The start of both ranges must be aligned to the file allocation
+	 * unit.
+	 */
+	if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) ||
+	    !IS_ALIGNED(fxr->file2_offset, alloc_unit))
+		return -EINVAL;
+
+	/* Ensure offsets don't wrap. */
+	if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) ||
+	    check_add_overflow(fxr->file2_offset, fxr->length, &tmp))
+		return -EINVAL;
+
+	/*
+	 * We require both ranges to end within EOF, unless we're exchanging
+	 * to EOF.
+	 */
+	if (!(fxr->flags & XFS_EXCHRANGE_TO_EOF) &&
+	    (fxr->file1_offset + fxr->length > size1 ||
+	     fxr->file2_offset + fxr->length > size2))
+		return -EINVAL;
+
+	/*
+	 * Make sure we don't hit any file size limits.  If we hit any size
+	 * limits such that test_length was adjusted, we abort the whole
+	 * operation.
+	 */
+	test_len = fxr->length;
+	error = generic_write_check_limits(fxr->file2, fxr->file2_offset,
+			&test_len);
+	if (error)
+		return error;
+	error = generic_write_check_limits(fxr->file1, fxr->file1_offset,
+			&test_len);
+	if (error)
+		return error;
+	if (test_len != fxr->length)
+		return -EINVAL;
+
+	/*
+	 * If the user wanted us to exchange up to the infile's EOF, round up
+	 * to the next allocation unit boundary for this check.  Do the same
+	 * for the outfile.
+	 *
+	 * Otherwise, reject the range length if it's not aligned to an
+	 * allocation unit.
+	 */
+	if (fxr->file1_offset + fxr->length == size1)
+		blen = ALIGN(size1, alloc_unit) - fxr->file1_offset;
+	else if (fxr->file2_offset + fxr->length == size2)
+		blen = ALIGN(size2, alloc_unit) - fxr->file2_offset;
+	else if (!IS_ALIGNED(fxr->length, alloc_unit))
+		return -EINVAL;
+	else
+		blen = fxr->length;
+
+	/* Don't allow overlapped exchanges within the same file. */
+	if (inode1 == inode2 &&
+	    fxr->file2_offset + blen > fxr->file1_offset &&
+	    fxr->file1_offset + blen > fxr->file2_offset)
+		return -EINVAL;
+
+	/*
+	 * Ensure that we don't exchange a partial EOF block into the middle of
+	 * another file.
+	 */
+	if ((fxr->length & allocmask) == 0)
+		return 0;
+
+	blen = fxr->length;
+	if (fxr->file2_offset + blen < size2)
+		blen &= ~allocmask;
+
+	if (fxr->file1_offset + blen < size1)
+		blen &= ~allocmask;
+
+	return blen == fxr->length ? 0 : -EINVAL;
+}
+
+/*
+ * Check that the two inodes are eligible for range exchanges, the ranges make
+ * sense, and then flush all dirty data.  Caller must ensure that the inodes
+ * have been locked against any other modifications.
+ */
+static inline int
+xfs_generic_exchrange_prep(
+	struct xfs_exchrange	*fxr,
+	unsigned int		alloc_unit)
+{
+	struct inode		*inode1 = file_inode(fxr->file1);
+	struct inode		*inode2 = file_inode(fxr->file2);
+	bool			same_inode = (inode1 == inode2);
+	int			error;
+
+	/* Check that we don't violate system file offset limits. */
+	error = xfs_generic_exchrange_checks(fxr, alloc_unit);
+	if (error || fxr->length == 0)
+		return error;
+
+	/* Wait for the completion of any pending IOs on both files */
+	inode_dio_wait(inode1);
+	if (!same_inode)
+		inode_dio_wait(inode2);
+
+	error = filemap_write_and_wait_range(inode1->i_mapping,
+			fxr->file1_offset,
+			fxr->file1_offset + fxr->length - 1);
+	if (error)
+		return error;
+
+	error = filemap_write_and_wait_range(inode2->i_mapping,
+			fxr->file2_offset,
+			fxr->file2_offset + fxr->length - 1);
+	if (error)
+		return error;
+
+	/*
+	 * If the files or inodes involved require synchronous writes, amend
+	 * the request to force the filesystem to flush all data and metadata
+	 * to disk after the operation completes.
+	 */
+	if (((fxr->file1->f_flags | fxr->file2->f_flags) & (__O_SYNC | O_DSYNC)) ||
+	    IS_SYNC(inode1) || IS_SYNC(inode2))
+		fxr->flags |= XFS_EXCHRANGE_DSYNC;
+
+	return 0;
+}
+
+/*
+ * Finish a range exchange operation, if it was successful.  Caller must ensure
+ * that the inodes are still locked against any other modifications.
+ */
+static inline int
+xfs_generic_exchrange_finish(
+	struct xfs_exchrange	*fxr)
+{
+	int			error;
+
+	error = file_remove_privs(fxr->file1);
+	if (error)
+		return error;
+	if (file_inode(fxr->file1) == file_inode(fxr->file2))
+		return 0;
+
+	return file_remove_privs(fxr->file2);
+}
+
+/* Exchange parts of two files. */
+int
+xfs_exchange_range(
+	struct xfs_exchrange	*fxr)
+{
+	struct inode		*inode1 = file_inode(fxr->file1);
+	struct inode		*inode2 = file_inode(fxr->file2);
+	int			ret;
+
+	BUILD_BUG_ON(XFS_IOC_EXCHANGE_RANGE == XFS_IOC_COMMIT_RANGE);
+	BUILD_BUG_ON(XFS_EXCHRANGE_ALL_FLAGS & XFS_EXCHRANGE_PRIVATE_FLAGS);
+
+	if (fxr->flags & ~(XFS_EXCHRANGE_ALL_FLAGS | __XFS_EXCHRANGE_CHECK_FRESH2))
+		return -EINVAL;
+
+	/*
+	 * The ioctl enforces that src and dest files are on the same mount.
+	 * However, they only need to be on the same file system.
+	 */
+	if (inode1->i_sb != inode2->i_sb)
+		return -EXDEV;
+
+	/* Userspace requests only honored for regular files. */
+	if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
+		return -EISDIR;
+	if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
+		return -EINVAL;
+
+	/* Both files must be opened for read and write. */
+	if (!(fxr->file1->f_mode & FMODE_READ) ||
+	    !(fxr->file1->f_mode & FMODE_WRITE) ||
+	    !(fxr->file2->f_mode & FMODE_READ) ||
+	    !(fxr->file2->f_mode & FMODE_WRITE))
+		return -EBADF;
+
+	/* Neither file can be opened append-only. */
+	if ((fxr->file1->f_flags & O_APPEND) ||
+	    (fxr->file2->f_flags & O_APPEND))
+		return -EBADF;
+
+	/*
+	 * If we're not exchanging to EOF, we can check the areas before
+	 * stabilizing both files' i_size.
+	 */
+	if (!(fxr->flags & XFS_EXCHRANGE_TO_EOF)) {
+		ret = xfs_generic_exchrange_verify_area(fxr);
+		if (ret)
+			return ret;
+	}
+
+	/* Update cmtime if the fd/inode don't forbid it. */
+	if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
+		fxr->flags |= __XFS_EXCHRANGE_UPD_CMTIME1;
+	if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))
+		fxr->flags |= __XFS_EXCHRANGE_UPD_CMTIME2;
+
+	file_start_write(fxr->file2);
+	ret = -EOPNOTSUPP; /* XXX call out to xfs code */
+	file_end_write(fxr->file2);
+	if (ret)
+		return ret;
+
+	fsnotify_modify(fxr->file1);
+	if (fxr->file2 != fxr->file1)
+		fsnotify_modify(fxr->file2);
+	return 0;
+}
diff --git a/fs/xfs/xfs_exchrange.h b/fs/xfs/xfs_exchrange.h
new file mode 100644
index 0000000000000..593a85a644bce
--- /dev/null
+++ b/fs/xfs/xfs_exchrange.h
@@ -0,0 +1,38 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_EXCHRANGE_H__
+#define __XFS_EXCHRANGE_H__
+
+/* Update the mtime/cmtime of file1 and file2 */
+#define __XFS_EXCHRANGE_UPD_CMTIME1	(1ULL << 63)
+#define __XFS_EXCHRANGE_UPD_CMTIME2	(1ULL << 62)
+
+/* Freshness check required */
+#define __XFS_EXCHRANGE_CHECK_FRESH2	(1ULL << 61)
+
+#define XFS_EXCHRANGE_PRIVATE_FLAGS	(__XFS_EXCHRANGE_UPD_CMTIME1 | \
+					 __XFS_EXCHRANGE_UPD_CMTIME2 | \
+					 __XFS_EXCHRANGE_CHECK_FRESH2)
+
+struct xfs_exchrange {
+	struct file		*file1;
+	struct file		*file2;
+
+	loff_t			file1_offset;
+	loff_t			file2_offset;
+	u64			length;
+
+	u64			flags;	/* XFS_EXCHRANGE flags */
+
+	/* file2 metadata for freshness checks if file1_ino != 0 */
+	u64			file2_ino;
+	struct timespec64	file2_mtime;
+	struct timespec64	file2_ctime;
+};
+
+int xfs_exchange_range(struct xfs_exchrange *fxr);
+
+#endif /* __XFS_EXCHRANGE_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 1360a551118dd..08fc15881ee51 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -40,6 +40,7 @@ 
 #include "xfs_xattr.h"
 #include "xfs_rtbitmap.h"
 #include "xfs_file.h"
+#include "xfs_exchrange.h"
 
 #include <linux/mount.h>
 #include <linux/namei.h>
@@ -1930,6 +1931,89 @@  xfs_ioctl_fs_counts(
 	return 0;
 }
 
+static long
+xfs_ioc_exchange_range(
+	struct file			*file,
+	struct xfs_exch_range __user	*argp)
+{
+	struct xfs_exchrange		fxr = {
+		.file2			= file,
+	};
+	struct xfs_exch_range		args;
+	struct fd			file1;
+	int				error;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+	if (memchr_inv(&args.pad, 0, sizeof(args.pad)))
+		return -EINVAL;
+	if (args.flags & ~XFS_EXCHRANGE_ALL_FLAGS)
+		return -EINVAL;
+
+	fxr.file1_offset	= args.file1_offset;
+	fxr.file2_offset	= args.file2_offset;
+	fxr.length		= args.length;
+	fxr.flags		= args.flags;
+
+	file1 = fdget(args.file1_fd);
+	if (!file1.file)
+		return -EBADF;
+	fxr.file1 = file1.file;
+
+	error = -EXDEV;
+	if (fxr.file1->f_path.mnt != fxr.file2->f_path.mnt)
+		goto fdput;
+
+	error = xfs_exchange_range(&fxr);
+fdput:
+	fdput(file1);
+	return error;
+}
+
+static long
+xfs_ioc_commit_range(
+	struct file			*file,
+	struct xfs_commit_range __user	*argp)
+{
+	struct xfs_exchrange		fxr = {
+		.file2			= file,
+	};
+	struct xfs_commit_range		args;
+	struct fd			file1;
+	int				error;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+	if (memchr_inv(&args.pad, 0, sizeof(args.pad)))
+		return -EINVAL;
+	if (args.flags & ~XFS_EXCHRANGE_ALL_FLAGS)
+		return -EINVAL;
+
+	fxr.file1_offset	= args.file1_offset;
+	fxr.file2_offset	= args.file2_offset;
+	fxr.length		= args.length;
+	fxr.flags		= args.flags | __XFS_EXCHRANGE_CHECK_FRESH2;
+	fxr.file2_ino		= args.file2_ino;
+	fxr.file2_mtime.tv_sec	= args.file2_mtime;
+	fxr.file2_mtime.tv_nsec	= args.file2_mtime_nsec;
+	fxr.file2_ctime.tv_sec	= args.file2_ctime;
+	fxr.file2_ctime.tv_nsec	= args.file2_ctime_nsec;
+
+	file1 = fdget(args.file1_fd);
+	if (!file1.file)
+		return -EBADF;
+	fxr.file1 = file1.file;
+
+	error = -EXDEV;
+	if (fxr.file1->f_path.mnt != fxr.file2->f_path.mnt)
+		goto fdput;
+
+	error = xfs_exchange_range(&fxr);
+fdput:
+	fdput(file1);
+	return error;
+}
+
 /*
  * These long-unused ioctls were removed from the official ioctl API in 5.17,
  * but retain these definitions so that we can log warnings about them.
@@ -2170,6 +2254,11 @@  xfs_file_ioctl(
 		return error;
 	}
 
+	case XFS_IOC_EXCHANGE_RANGE:
+		return xfs_ioc_exchange_range(filp, arg);
+	case XFS_IOC_COMMIT_RANGE:
+		return xfs_ioc_commit_range(filp, arg);
+
 	default:
 		return -ENOTTY;
 	}