From patchwork Sun Dec 31 22:27:18 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507878
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 61312C12B
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 22:27:19 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="UbKMgPzT"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 2F555C433C8;
	Sun, 31 Dec 2023 22:27:19 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704061639;
	bh=5iawxK7ibeLrqBqc3iGXZu4Nl0RPkyK0C4Xb7CmzNs4=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=UbKMgPzTtz+PzsxilyRxQaDxMqgMvzQHHUTjdt/2zs/GnM4hRy1icRDnc071LJaZe
	 mmR3T2j586iM3gbbHHiTsfYAqiIEcXcX/lLkWbBFarX8zOU8ui6unNOJx3FgRQo13a
	 kZ0dqKPhpBzgLD6cZma119L+cw6wNXdiI05B6lvtK5nex1Ilpn/uq1JWmOcOSdLEB4
	 ewNVSMKA7zZU20fde7ZGP2lAT61hIz0Vq9FZJgJBrDrGtvD56z7tqWkUi232ZywX49
	 o4gH90zVXFcBd8ReyvbLz9ORjw3y27bxgzCA5IJeHOiaOWbdoCX3CxVbO1cm+TXRNi
	 k3OFe9DG7u21w==
Date: Sun, 31 Dec 2023 14:27:18 -0800
Subject: [PATCH 01/20] xfs: add a libxfs header file for staging new ioctls
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404996288.1796128.1378431313126059439.stgit@frogsfrogsfrogs>
In-Reply-To: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
References: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Create a new xfs_fs_staging.h header where we can land experimental
ioctls without committing them to any stable interfaces anywhere.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/libxfs.h        |    1 +
 include/xfs.h           |    1 +
 libxfs/Makefile         |    1 +
 libxfs/libxfs_priv.h    |    1 +
 libxfs/xfs_fs_staging.h |   18 ++++++++++++++++++
 5 files changed, 22 insertions(+)
 create mode 100644 libxfs/xfs_fs_staging.h

diff --git a/include/libxfs.h b/include/libxfs.h
index 16667b9d8b3..9e8596bedf9 100644
--- a/include/libxfs.h
+++ b/include/libxfs.h
@@ -27,6 +27,7 @@
 
 #include "xfs_types.h"
 #include "xfs_fs.h"
+#include "xfs_fs_staging.h"
 #include "xfs_arch.h"
 
 #include "xfs_shared.h"
diff --git a/include/xfs.h b/include/xfs.h
index e97158c8d22..c4a95bec9a9 100644
--- a/include/xfs.h
+++ b/include/xfs.h
@@ -44,5 +44,6 @@ extern int xfs_assert_largefile[sizeof(off_t)-8];
 /* Include deprecated/compat pre-vfs xfs-specific symbols */
 #include <xfs/xfs_fs_compat.h>
 #include <xfs/xfs_fs.h>
+#include <xfs/xfs_fs_staging.h>
 
 #endif	/* __XFS_H__ */
diff --git a/libxfs/Makefile b/libxfs/Makefile
index e1248c2b3ca..ed22f5c873e 100644
--- a/libxfs/Makefile
+++ b/libxfs/Makefile
@@ -14,6 +14,7 @@ LTLDFLAGS += -static
 
 # headers to install in include/xfs
 PKGHFILES = xfs_fs.h \
+	xfs_fs_staging.h \
 	xfs_types.h \
 	xfs_da_format.h \
 	xfs_format.h \
diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h
index e3d9b70cc17..4d9c49091bc 100644
--- a/libxfs/libxfs_priv.h
+++ b/libxfs/libxfs_priv.h
@@ -60,6 +60,7 @@
 #include "xfs_arch.h"
 
 #include "xfs_fs.h"
+#include "xfs_fs_staging.h"
 #include "libfrog/crc32c.h"
 
 #include <sys/xattr.h>
diff --git a/libxfs/xfs_fs_staging.h b/libxfs/xfs_fs_staging.h
new file mode 100644
index 00000000000..d220790d5b5
--- /dev/null
+++ b/libxfs/xfs_fs_staging.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
+/*
+ * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_FS_STAGING_H__
+#define __XFS_FS_STAGING_H__
+
+/*
+ * Experimental system calls, ioctls and data structures supporting them.
+ * Nothing in here should be considered part of a stable interface of any kind.
+ *
+ * If you add an ioctl here, please leave a comment in xfs_fs.h marking it
+ * reserved.  If you promote anything out of this file, please leave a comment
+ * explaining where it went.
+ */
+
+#endif /* __XFS_FS_STAGING_H__ */

From patchwork Sun Dec 31 22:27:34 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507879
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0B7EAC129
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 22:27:35 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="Dw/qBhLR"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id C83BBC433C8;
	Sun, 31 Dec 2023 22:27:34 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704061654;
	bh=xD2jXyNVgIDCrtbkgeO7OCuY66l3drp5+AwcoaDG428=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=Dw/qBhLRX70VvEEGnAmcek9iyCNO8SlPrLKcATuD3q7kAFBruDVskjHRP/M9bhP2w
	 chY3dasslccsmTqBHh+J/ZNem86yPpU2hA1EUwoZ59Vq2BwC1rgJEf/GhAxQvNQCZr
	 iWa2rL3Go7lUN8y83dUXIQkmwmgszarYxRmpc1ovr/ufEwj9qx4M+g8PtEapth9504
	 DUlo0V5AXOltg5xyfdY0BhgYi99AZMLE7wXiw9vMv1JriKpZfKHc5UzNYFJtGOY0JP
	 j08lkr0jy4AoK2y7cWpKh3oManu6y90iMvUQbW2ke8aHlZeJFk3ZkB9cC9BEiBf629
	 2sYiUUlmeDS1g==
Date: Sun, 31 Dec 2023 14:27:34 -0800
Subject: [PATCH 02/20] xfs: introduce new file range exchange ioctl
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404996301.1796128.17214457854099877041.stgit@frogsfrogsfrogs>
In-Reply-To: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
References: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Introduce a new ioctl to handle swapping ranges of bytes between files.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_fs.h                     |    1 
 libxfs/xfs_fs_staging.h             |   89 +++++++++++
 man/man2/ioctl_xfs_exchange_range.2 |  296 +++++++++++++++++++++++++++++++++++
 3 files changed, 386 insertions(+)
 create mode 100644 man/man2/ioctl_xfs_exchange_range.2

diff --git a/libxfs/xfs_fs.h b/libxfs/xfs_fs.h
index ca1b17d0143..ec92e6ded6b 100644
--- a/libxfs/xfs_fs.h
+++ b/libxfs/xfs_fs.h
@@ -843,6 +843,7 @@ struct xfs_scrub_metadata {
 #define XFS_IOC_FSGEOMETRY	     _IOR ('X', 126, struct xfs_fsop_geom)
 #define XFS_IOC_BULKSTAT	     _IOR ('X', 127, struct xfs_bulkstat_req)
 #define XFS_IOC_INUMBERS	     _IOR ('X', 128, struct xfs_inumbers_req)
+/*	XFS_IOC_EXCHANGE_RANGE -------- staging 129	 */
 /*	XFS_IOC_GETFSUUID ---------- deprecated 140	 */
 
 
diff --git a/libxfs/xfs_fs_staging.h b/libxfs/xfs_fs_staging.h
index d220790d5b5..e3d9f3b32b0 100644
--- a/libxfs/xfs_fs_staging.h
+++ b/libxfs/xfs_fs_staging.h
@@ -15,4 +15,93 @@
  * explaining where it went.
  */
 
+/*
+ * Exchange part of file1 with part of the file that this ioctl is being
+ * called against (which we'll call file2).  Filesystems must be able to
+ * restart and complete the operation even after the system goes down.
+ */
+struct xfs_exch_range {
+	__s64		file1_fd;	/* fd of file1 */
+	__s64		file1_offset;	/* file1 offset, bytes */
+	__s64		file2_offset;	/* file2 offset, bytes */
+	__u64		length;		/* bytes to exchange */
+
+	__u64		flags;		/* see XFS_EXCH_RANGE_* below */
+
+	/* file2 metadata for optional freshness checks */
+	__s64		file2_ino;	/* inode number */
+	__s64		file2_mtime;	/* modification time */
+	__s64		file2_ctime;	/* change time */
+	__s32		file2_mtime_nsec; /* mod time, nsec */
+	__s32		file2_ctime_nsec; /* change time, nsec */
+
+	__u64		pad[6];		/* must be zeroes */
+};
+
+/*
+ * Atomic exchange operations are not required.  This relaxes the requirement
+ * that the filesystem must be able to complete the operation after a crash.
+ */
+#define XFS_EXCH_RANGE_NONATOMIC	(1 << 0)
+
+/*
+ * Check file2's inode number, mtime, and ctime against the values provided,
+ * and return -EBUSY if there isn't an exact match.
+ */
+#define XFS_EXCH_RANGE_FILE2_FRESH	(1 << 1)
+
+/*
+ * Check that file1's length is equal to file1_offset + length, and that
+ * file2's length is equal to file2_offset + length.  Returns -EDOM if there
+ * isn't an exact match.
+ */
+#define XFS_EXCH_RANGE_FULL_FILES	(1 << 2)
+
+/*
+ * Exchange file data all the way to the ends of both files, and then exchange
+ * the file sizes.  This flag can be used to replace a file's contents with a
+ * different amount of data.  length will be ignored.
+ */
+#define XFS_EXCH_RANGE_TO_EOF		(1 << 3)
+
+/* Flush all changes in file data and file metadata to disk before returning. */
+#define XFS_EXCH_RANGE_FSYNC		(1 << 4)
+
+/* Dry run; do all the parameter verification but do not change anything. */
+#define XFS_EXCH_RANGE_DRY_RUN		(1 << 5)
+
+/*
+ * Exchange only the parts of the two files where the file allocation units
+ * mapped to file1's range have been written to.  This can accelerate
+ * scatter-gather atomic writes with a temp file if all writes are aligned to
+ * the file allocation unit.
+ */
+#define XFS_EXCH_RANGE_FILE1_WRITTEN	(1 << 6)
+
+/*
+ * Commit the contents of file1 into file2 if file2 has the same inode number,
+ * mtime, and ctime as the arguments provided to the call.  The old contents of
+ * file2 will be moved to file1.
+ *
+ * With this flag, all committed information can be retrieved even if the
+ * system crashes or is rebooted.  This includes writing through or flushing a
+ * disk cache if present.  The call blocks until the device reports that the
+ * commit is complete.
+ *
+ * This flag should not be combined with NONATOMIC.  It can be combined with
+ * FILE1_WRITTEN.
+ */
+#define XFS_EXCH_RANGE_COMMIT		(XFS_EXCH_RANGE_FILE2_FRESH | \
+					 XFS_EXCH_RANGE_FSYNC)
+
+#define XFS_EXCH_RANGE_ALL_FLAGS	(XFS_EXCH_RANGE_NONATOMIC | \
+					 XFS_EXCH_RANGE_FILE2_FRESH | \
+					 XFS_EXCH_RANGE_FULL_FILES | \
+					 XFS_EXCH_RANGE_TO_EOF | \
+					 XFS_EXCH_RANGE_FSYNC | \
+					 XFS_EXCH_RANGE_DRY_RUN | \
+					 XFS_EXCH_RANGE_FILE1_WRITTEN)
+
+#define XFS_IOC_EXCHANGE_RANGE	_IOWR('X', 129, struct xfs_exch_range)
+
 #endif /* __XFS_FS_STAGING_H__ */
diff --git a/man/man2/ioctl_xfs_exchange_range.2 b/man/man2/ioctl_xfs_exchange_range.2
new file mode 100644
index 00000000000..a292d8e9641
--- /dev/null
+++ b/man/man2/ioctl_xfs_exchange_range.2
@@ -0,0 +1,296 @@
+.\" Copyright (c) 2020-2024 Oracle.  All rights reserved.
+.\"
+.\" %%%LICENSE_START(GPLv2+_DOC_FULL)
+.\" This is free documentation; you can redistribute it and/or
+.\" modify it under the terms of the GNU General Public License as
+.\" published by the Free Software Foundation; either version 2 of
+.\" the License, or (at your option) any later version.
+.\"
+.\" The GNU General Public License's references to "object code"
+.\" and "executables" are to be interpreted as the output of any
+.\" document formatting or typesetting system, including
+.\" intermediate and printed output.
+.\"
+.\" This manual is distributed in the hope that it will be useful,
+.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
+.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+.\" GNU General Public License for more details.
+.\"
+.\" You should have received a copy of the GNU General Public
+.\" License along with this manual; if not, see
+.\" <http://www.gnu.org/licenses/>.
+.\" %%%LICENSE_END
+.TH IOCTL-XFS-EXCHANGE-RANGE 2  2023-05-08 "XFS"
+.SH NAME
+ioctl_xfs_exchange_range \- exchange the contents of parts of two files
+.SH SYNOPSIS
+.br
+.B #include <sys/ioctl.h>
+.br
+.B #include <xfs/xfs_fs_staging.h>
+.PP
+.BI "int ioctl(int " file2_fd ", XFS_IOC_EXCHANGE_RANGE, struct xfs_exch_range *" arg );
+.SH DESCRIPTION
+Given a range of bytes in a first file
+.B file1_fd
+and a second range of bytes in a second file
+.BR file2_fd ,
+this
+.BR ioctl (2)
+exchanges the contents of the two ranges.
+.PP
+Exchanges are atomic with regards to concurrent file operations, so no
+userspace-level locks need to be taken to obtain consistent results.
+Implementations must guarantee that readers see either the old contents or the
+new contents in their entirety, even if the system fails.
+.PP
+The exchange parameters are conveyed in a structure of the following form:
+.PP
+.in +4n
+.EX
+struct xfs_exch_range {
+    __s64    file1_fd;
+    __s64    file1_offset;
+    __s64    file2_offset;
+    __u64    length;
+
+    __u64    flags;
+
+    __s64    file2_ino;
+    __s64    file2_mtime;
+    __s64    file2_ctime;
+    __s32    file2_mtime_nsec;
+    __s32    file2_ctime_nsec;
+
+    __u64    pad[6];
+};
+.EE
+.in
+.PP
+The field
+.I pad
+must be zero.
+.PP
+The fields
+.IR file1_fd ", " file1_offset ", and " length
+define the first range of bytes to be exchanged.
+.PP
+The fields
+.IR file2_fd ", " file2_offset ", and " length
+define the second range of bytes to be exchanged.
+.PP
+Both files must be from the same filesystem mount.
+If the two file descriptors represent the same file, the byte ranges must not
+overlap.
+Most disk-based filesystems require that the starts of both ranges must be
+aligned to the file block size.
+If this is the case, the ends of the ranges must also be so aligned unless the
+.B XFS_EXCH_RANGE_TO_EOF
+flag is set.
+
+.PP
+The field
+.I flags
+controls the behavior of the exchange operation.
+.RS 0.4i
+.TP
+.B XFS_EXCH_RANGE_FILE2_FRESH
+Check the freshness of
+.I file2_fd
+after locking the file but before exchanging the contents.
+The supplied
+.IR file2_ino " field"
+must match file2's inode number, and the supplied
+.IR file2_mtime ", " file2_mtime_nsec ", " file2_ctime ", and " file2_ctime_nsec
+fields must match the modification time and change time of file2.
+If they do not match,
+.B EBUSY
+will be returned.
+.TP
+.B XFS_EXCH_RANGE_TO_EOF
+Ignore the
+.I length
+parameter.
+All bytes in
+.I file1_fd
+from
+.I file1_offset
+to EOF are moved to
+.IR file2_fd ,
+and file2's size is set to
+.RI ( file2_offset "+(" file1_length - file1_offset )).
+Meanwhile, all bytes in file2 from
+.I file2_offset
+to EOF are moved to file1 and file1's size is set to
+.RI ( file1_offset "+(" file2_length - file2_offset )).
+This option is not compatible with
+.BR XFS_EXCH_RANGE_FULL_FILES .
+.TP
+.B XFS_EXCH_RANGE_FSYNC
+Ensure that all modified in-core data in both file ranges and all metadata
+updates pertaining to the exchange operation are flushed to persistent storage
+before the call returns.
+Opening either file descriptor with
+.BR O_SYNC " or " O_DSYNC
+will have the same effect.
+.TP
+.B XFS_EXCH_RANGE_FILE1_WRITTEN
+Only exchange sub-ranges of
+.I file1_fd
+that are known to contain data written by application software.
+Each sub-range may be expanded (both upwards and downwards) to align with the
+file allocation unit.
+For files on the data device, this is one filesystem block.
+For files on the realtime device, this is the realtime extent size.
+This facility can be used to implement fast atomic scatter-gather writes of any
+complexity for software-defined storage targets if all writes are aligned to
+the file allocation unit.
+.TP
+.B XFS_EXCH_RANGE_DRY_RUN
+Check the parameters and the feasibility of the operation, but do not change
+anything.
+.TP
+.B XFS_EXCH_RANGE_COMMIT
+This flag is a combination of
+.BR XFS_EXCH_RANGE_FILE2_FRESH " | " XFS_EXCH_RANGE_FSYNC
+and can be used to commit changes to
+.I file2_fd
+to persistent storage if and only if file2 has not changed.
+.TP
+.B XFS_EXCH_RANGE_FULL_FILES
+Require that
+.IR file1_offset " and " file2_offset
+are zero, and that the
+.I length
+field matches the lengths of both files.
+If not,
+.B EDOM
+will be returned.
+This option is not compatible with
+.BR XFS_EXCH_RANGE_TO_EOF .
+.TP
+.B XFS_EXCH_RANGE_NONATOMIC
+This flag relaxes the requirement that readers see only the old contents or
+the new contents in their entirety.
+If the system fails before all modified in-core data and metadata updates
+are persisted to disk, the contents of both file ranges after recovery are not
+defined and may be a mix of both.
+
+Do not use this flag unless the contents of both ranges are known to be
+identical and there are no other writers.
+.RE
+.PP
+.SH RETURN VALUE
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.PP
+.SH ERRORS
+Error codes can be one of, but are not limited to, the following:
+.TP
+.B EBADF
+.IR file1_fd
+is not open for reading and writing or is open for append-only writes; or
+.IR file2_fd
+is not open for reading and writing or is open for append-only writes.
+.TP
+.B EBUSY
+The inode number and timestamps supplied do not match
+.IR file2_fd
+and
+.B XFS_EXCH_RANGE_FILE2_FRESH
+was set in
+.IR flags .
+.TP
+.B EDOM
+The ranges do not cover the entirety of both files, and
+.B XFS_EXCH_RANGE_FULL_FILES
+was set in
+.IR flags .
+.TP
+.B EINVAL
+The parameters are not correct for these files.
+This error can also appear if either file descriptor represents
+a device, FIFO, or socket.
+Disk filesystems generally require the offset and length arguments
+to be aligned to the fundamental block sizes of both files.
+.TP
+.B EIO
+An I/O error occurred.
+.TP
+.B EISDIR
+One of the files is a directory.
+.TP
+.B ENOMEM
+The kernel was unable to allocate sufficient memory to perform the
+operation.
+.TP
+.B ENOSPC
+There is not enough free space in the filesystem to exchange the contents safely.
+.TP
+.B EOPNOTSUPP
+The filesystem does not support exchanging bytes between the two
+files.
+.TP
+.B EPERM
+.IR file1_fd " or " file2_fd
+is immutable.
+.TP
+.B ETXTBSY
+One of the files is a swap file.
+.TP
+.B EUCLEAN
+The filesystem is corrupt.
+.TP
+.B EXDEV
+.IR file1_fd " and " file2_fd
+are not on the same mounted filesystem.
+.SH CONFORMING TO
+This API is XFS-specific.
+.SH USE CASES
+.PP
+Three use cases are imagined for this system call.
+.PP
+The first is a filesystem defragmenter, which copies the contents of a file
+into another file and wishes to exchange the space mappings of the two files,
+provided that the original file has not changed.  The flags
+.BR NONATOMIC " and " FILE2_FRESH
+are recommended for this application.
+.PP
+The second is a data storage program that wants to commit non-contiguous updates
+to a file atomically.  This can be done by creating a temporary file, calling
+.BR FICLONE (2)
+to share the contents, and staging the updates into the temporary file.
+Either of the
+.BR FULL_FILES " or " TO_EOF
+flags is recommended, along with
+.BR FSYNC .
+Depending on the application's locking design, the flags
+.BR FILE2_FRESH " or " COMMIT
+may be applicable here.
+The temporary file can be deleted or punched out afterwards.
+.PP
+The third is a software-defined storage host (e.g. a disk jukebox) which
+implements an atomic scatter-gather write command.
+Provided the exported disk's logical block size matches the file's allocation
+unit size, this can be done by creating a temporary file and writing the data
+at the appropriate offsets.
+It is recommended that the temporary file be truncated to the size of the
+regular file before any writes are staged to the temporary file to avoid issues
+with zeroing during EOF extension.
+Use this call with the
+.B FILE1_WRITTEN
+flag to exchange only the file allocation units involved in the emulated
+device's write command.
+The use of the
+.B FSYNC
+flag is recommended here.
+The temporary file should be deleted or punched out completely before being
+reused to stage another write.
+.\"
+.SH NOTES
+.PP
+Some filesystems may limit the amount of data or the number of extents that can
+be exchanged in a single call.
+.SH SEE ALSO
+.BR ioctl (2)

From patchwork Sun Dec 31 22:27:50 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507880
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id A9AD1C12D
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 22:27:50 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="FSUG9rpZ"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 78D2BC433C8;
	Sun, 31 Dec 2023 22:27:50 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704061670;
	bh=km9QJpkM0NpW/whmktokrqvVsUgm5KWaZ3UNuV4WyBM=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=FSUG9rpZESnmll9ZtBlxiBXU/fLfZLIyfnZpGKNsAnG425aua3C09Cc1XPU/GqzBW
	 MeSCAwRVtPeXpfsK0ScozpmattTlbrpr/7+8JsI0fZWa5PfcCrY5+5BlH2yn4AGXRv
	 WYUAT1VT6YrnyZOO5pFFjD1i/ZbehbbvycySQ3kJhadZyg+ALHyJRiI3s6d4se/b3E
	 dQxvD6AqRftTrS1CqN70kvrtktLTLAXUVnb/fOnQZZnIYtbz/syEKSpWFoaJxtLTJ0
	 9krnZaGRlwUuT8s3aSRjmu9P7FbwdUi/5+VUS2bHl+yu8Nt//U/wasAvVMqKnknq8H
	 dAdvFv35xHNvQ==
Date: Sun, 31 Dec 2023 14:27:50 -0800
Subject: [PATCH 03/20] xfs: parameterize all the incompat log feature helpers
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404996314.1796128.156094362141521193.stgit@frogsfrogsfrogs>
In-Reply-To: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
References: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

We're about to define a new XFS_SB_FEAT_INCOMPAT_LOG_ bit, which means
that callers will soon require the ability to toggle on and off
different log incompat feature bits.  Parameterize the
xlog_{use,drop}_incompat_feat and xfs_sb_remove_incompat_log_features
functions so that callers can specify which feature they're trying to
use and so that we can clear individual log incompat bits as needed.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_format.h |    5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index e6ca188e227..4baafff6197 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -404,9 +404,10 @@ xfs_sb_has_incompat_log_feature(
 
 static inline void
 xfs_sb_remove_incompat_log_features(
-	struct xfs_sb	*sbp)
+	struct xfs_sb	*sbp,
+	uint32_t	feature)
 {
-	sbp->sb_features_log_incompat &= ~XFS_SB_FEAT_INCOMPAT_LOG_ALL;
+	sbp->sb_features_log_incompat &= ~feature;
 }
 
 static inline void

From patchwork Sun Dec 31 22:28:05 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507881
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4684FC127
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 22:28:06 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="TKXFqGTe"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 15C66C433C8;
	Sun, 31 Dec 2023 22:28:06 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704061686;
	bh=sglLl4PYk/PznthEWH+3xa7W3WeyMiMjN+BQ458OUtA=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=TKXFqGTeqXbS9chkD0zMw7maPi7N+21W4lTpBqDYJd1ueq7UHSxSYzuI/Ux50Flpo
	 2cd966s4P7D+ur/r7/d21w6RGYOipjkQbd0oklcQuOimPz/JOheGG/h4LSvGpTaKRp
	 SwdoAmHVRwc+3SzI/8Cu3JsjV8wmcqZBKB7lvUCNAle3f82X//jBOrllKUY4u4iWdZ
	 bP/uzJV9pvADypVSxQCjwijn6vYwG/H99vCniZDA8NZICEqoZQ2bLTz802PKThQYbk
	 gc9cEjmoQK8D8aua4ItsXi1p0wmzFP0gGOJvjG+aMm2Q8XqQH7LeSKAOcczFTHEyVD
	 onL00ayxcV1Aw==
Date: Sun, 31 Dec 2023 14:28:05 -0800
Subject: [PATCH 04/20] xfs: create a log incompat flag for atomic extent
 swapping
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404996327.1796128.7646738653562611664.stgit@frogsfrogsfrogs>
In-Reply-To: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
References: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Create a log incompat flag so that we only attempt to process swap
extent log items if the filesystem supports it, and a geometry flag to
advertise support if it's present.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_format.h             |    6 +++
 libxfs/xfs_fs.h                 |    3 ++
 libxfs/xfs_sb.c                 |    3 ++
 libxfs/xfs_swapext.h            |   75 +++++++++++++++++++++++++++++++++++++++
 man/man2/ioctl_xfs_fsgeometry.2 |    3 ++
 5 files changed, 90 insertions(+)
 create mode 100644 libxfs/xfs_swapext.h

diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index 4baafff6197..c0209bd21db 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -391,6 +391,12 @@ xfs_sb_has_incompat_feature(
 }
 
 #define XFS_SB_FEAT_INCOMPAT_LOG_XATTRS   (1 << 0)	/* Delayed Attributes */
+
+/*
+ * Log contains SXI log intent items which are not otherwise protected by
+ * an INCOMPAT/RO_COMPAT feature flag.
+ */
+#define XFS_SB_FEAT_INCOMPAT_LOG_SWAPEXT  (1U << 31)
 #define XFS_SB_FEAT_INCOMPAT_LOG_ALL \
 	(XFS_SB_FEAT_INCOMPAT_LOG_XATTRS)
 #define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN	~XFS_SB_FEAT_INCOMPAT_LOG_ALL
diff --git a/libxfs/xfs_fs.h b/libxfs/xfs_fs.h
index ec92e6ded6b..63a145e5035 100644
--- a/libxfs/xfs_fs.h
+++ b/libxfs/xfs_fs.h
@@ -240,6 +240,9 @@ typedef struct xfs_fsop_resblks {
 #define XFS_FSOP_GEOM_FLAGS_INOBTCNT	(1 << 22) /* inobt btree counter */
 #define XFS_FSOP_GEOM_FLAGS_NREXT64	(1 << 23) /* large extent counters */
 
+/* atomic file extent swap available to userspace */
+#define XFS_FSOP_GEOM_FLAGS_ATOMIC_SWAP	(1U << 31)
+
 /*
  * Minimum and maximum sizes need for growth checks.
  *
diff --git a/libxfs/xfs_sb.c b/libxfs/xfs_sb.c
index 30a6bc07d88..fd017d18cda 100644
--- a/libxfs/xfs_sb.c
+++ b/libxfs/xfs_sb.c
@@ -24,6 +24,7 @@
 #include "xfs_health.h"
 #include "xfs_ag.h"
 #include "xfs_rtbitmap.h"
+#include "xfs_swapext.h"
 
 /*
  * Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -1256,6 +1257,8 @@ xfs_fs_geometry(
 	}
 	if (xfs_has_large_extent_counts(mp))
 		geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64;
+	if (xfs_atomic_swap_supported(mp))
+		geo->flags |= XFS_FSOP_GEOM_FLAGS_ATOMIC_SWAP;
 	geo->rtsectsize = sbp->sb_blocksize;
 	geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
 
diff --git a/libxfs/xfs_swapext.h b/libxfs/xfs_swapext.h
new file mode 100644
index 00000000000..01bb3271f64
--- /dev/null
+++ b/libxfs/xfs_swapext.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SWAPEXT_H_
+#define __XFS_SWAPEXT_H_ 1
+
+/*
+ * Decide if this filesystem has the minimum feature set needed to run the
+ * swapext iteration code in non-atomic swap mode: reflink or rmapbt.  That
+ * mode uses the BUI log items introduced for those two features, but does not
+ * use swapext log items to track progress over a file range.
+ */
+static inline bool
+xfs_swapext_supports_nonatomic(
+	struct xfs_mount	*mp)
+{
+	return xfs_has_reflink(mp) || xfs_has_rmapbt(mp);
+}
+
+/*
+ * Decide if this filesystem has a new enough permanent feature set (V5 with an
+ * INCOMPAT bit outside the five listed below) to protect swapext log items
+ * from replay on a kernel lacking XFS_SB_FEAT_INCOMPAT_LOG_SWAPEXT support.
+ */
+static inline bool
+xfs_swapext_can_use_without_log_assistance(
+	struct xfs_mount	*mp)
+{
+	if (!xfs_sb_is_v5(&mp->m_sb))
+		return false;
+
+	if (xfs_sb_has_incompat_feature(&mp->m_sb,
+				~(XFS_SB_FEAT_INCOMPAT_FTYPE |
+				  XFS_SB_FEAT_INCOMPAT_SPINODES |
+				  XFS_SB_FEAT_INCOMPAT_META_UUID |
+				  XFS_SB_FEAT_INCOMPAT_BIGTIME |
+				  XFS_SB_FEAT_INCOMPAT_NREXT64)))
+		return true;
+
+	return false;
+}
+
+/*
+ * Decide if atomic extent swapping could be used on this filesystem.  This
+ * does not say anything about the filesystem's readiness to do that.
+ */
+static inline bool
+xfs_atomic_swap_supported(
+	struct xfs_mount	*mp)
+{
+	/*
+	 * In theory, we could support atomic extent swapping by setting
+	 * XFS_SB_FEAT_INCOMPAT_LOG_SWAPEXT on any filesystem and that would be
+	 * sufficient to protect the swapext log items that would be created.
+	 * However, we don't want to enable new features on a really old
+	 * filesystem, so we'll only advertise atomic swap support on the ones
+	 * that support BUI log items.
+	 */
+	if (xfs_swapext_supports_nonatomic(mp))
+		return true;
+
+	/*
+	 * If the filesystem has an INCOMPAT feature bit other than the ones
+	 * masked off by the helper below, then it's new enough not to need
+	 * INCOMPAT_LOG_SWAPEXT to protect swapext log items.
+	 */
+	if (xfs_swapext_can_use_without_log_assistance(mp))
+		return true;
+
+	return false;
+}
+
+#endif /* __XFS_SWAPEXT_H_ */
diff --git a/man/man2/ioctl_xfs_fsgeometry.2 b/man/man2/ioctl_xfs_fsgeometry.2
index f59a6e8a6a2..4c7ff9a270b 100644
--- a/man/man2/ioctl_xfs_fsgeometry.2
+++ b/man/man2/ioctl_xfs_fsgeometry.2
@@ -211,6 +211,9 @@ Filesystem stores reverse mappings of blocks to owners.
 .TP
 .B XFS_FSOP_GEOM_FLAGS_REFLINK
 Filesystem supports sharing blocks between files.
+.TP
+.B XFS_FSOP_GEOM_FLAGS_ATOMICSWAP
+Filesystem can exchange file contents atomically via XFS_IOC_EXCHANGE_RANGE.
 .RE
 .SH XFS METADATA HEALTH REPORTING
 .PP

From patchwork Sun Dec 31 22:28:21 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507882
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id D71D9BE67
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 22:28:21 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="MeYx//i1"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id A8343C433C8;
	Sun, 31 Dec 2023 22:28:21 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704061701;
	bh=kb+SRakTp6BmZpP2l7drmzh2NxPB2mTCz7lovIoKHdI=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=MeYx//i18ENTlAkKyBsbA/bIfNXzi1p/yCHRzGAbAy+MG2q6jIBFo8b8qtWhquxz5
	 LpUxVl2gMLKfXHZiPA/m8hWfYvnxUlczbNq0pgIfxS+nGXikfDO/Cccmh2H+2NIGl5
	 oJwMBufEL4L9kqAxRz6IZaHICCLM7Vqx5IZSVngE0paSvWxB8XIqLSt9ecVKejL+RB
	 F3fIlDwxWlprMXTjpO29l3h9obP3zsTi/OHD8/cMlRCvsuNVPhyCofLOuBsZgSDXOk
	 i7pNZBw95WXn4W5S5njHDqu1VHF+h5qam+SyEOeFlw3vgLP44Wcv1qBYmnstpAm7YJ
	 YpxQNPFK5QWWA==
Date: Sun, 31 Dec 2023 14:28:21 -0800
Subject: [PATCH 05/20] xfs: introduce a swap-extent log intent item
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404996341.1796128.15576953269608510832.stgit@frogsfrogsfrogs>
In-Reply-To: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
References: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Introduce a new intent log item to handle swapping extents.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_log_format.h |   51 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 48 insertions(+), 3 deletions(-)

diff --git a/libxfs/xfs_log_format.h b/libxfs/xfs_log_format.h
index 16872972e1e..24c3d5dc361 100644
--- a/libxfs/xfs_log_format.h
+++ b/libxfs/xfs_log_format.h
@@ -117,8 +117,9 @@ struct xfs_unmount_log_format {
 #define XLOG_REG_TYPE_ATTRD_FORMAT	28
 #define XLOG_REG_TYPE_ATTR_NAME	29
 #define XLOG_REG_TYPE_ATTR_VALUE	30
-#define XLOG_REG_TYPE_MAX		30
-
+#define XLOG_REG_TYPE_SXI_FORMAT	31
+#define XLOG_REG_TYPE_SXD_FORMAT	32
+#define XLOG_REG_TYPE_MAX		32
 
 /*
  * Flags to log operation header
@@ -243,6 +244,8 @@ typedef struct xfs_trans_header {
 #define	XFS_LI_BUD		0x1245
 #define	XFS_LI_ATTRI		0x1246  /* attr set/remove intent*/
 #define	XFS_LI_ATTRD		0x1247  /* attr set/remove done */
+#define	XFS_LI_SXI		0x1248  /* extent swap intent */
+#define	XFS_LI_SXD		0x1249  /* extent swap done */
 
 #define XFS_LI_TYPE_DESC \
 	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
@@ -260,7 +263,9 @@ typedef struct xfs_trans_header {
 	{ XFS_LI_BUI,		"XFS_LI_BUI" }, \
 	{ XFS_LI_BUD,		"XFS_LI_BUD" }, \
 	{ XFS_LI_ATTRI,		"XFS_LI_ATTRI" }, \
-	{ XFS_LI_ATTRD,		"XFS_LI_ATTRD" }
+	{ XFS_LI_ATTRD,		"XFS_LI_ATTRD" }, \
+	{ XFS_LI_SXI,		"XFS_LI_SXI" }, \
+	{ XFS_LI_SXD,		"XFS_LI_SXD" }
 
 /*
  * Inode Log Item Format definitions.
@@ -878,6 +883,46 @@ struct xfs_bud_log_format {
 	uint64_t		bud_bui_id;	/* id of corresponding bui */
 };
 
+/*
+ * SXI/SXD (extent swapping) log format definitions
+ */
+
+struct xfs_swap_extent {
+	uint64_t		sx_inode1;
+	uint64_t		sx_inode2;
+	uint64_t		sx_startoff1;
+	uint64_t		sx_startoff2;
+	uint64_t		sx_blockcount;
+	uint64_t		sx_flags;
+	int64_t			sx_isize1;
+	int64_t			sx_isize2;
+};
+
+#define XFS_SWAP_EXT_FLAGS		(0)
+
+#define XFS_SWAP_EXT_STRINGS
+
+/* This is the structure used to lay out an sxi log item in the log. */
+struct xfs_sxi_log_format {
+	uint16_t		sxi_type;	/* sxi log item type */
+	uint16_t		sxi_size;	/* size of this item */
+	uint32_t		__pad;		/* must be zero */
+	uint64_t		sxi_id;		/* sxi identifier */
+	struct xfs_swap_extent	sxi_extent;	/* extent to swap */
+};
+
+/*
+ * This is the structure used to lay out an sxd log item in the
+ * log.  It has no variable-length payload; sxd_sxi_id identifies
+ * the sxi log item that this sxd item corresponds to.
+ */
+struct xfs_sxd_log_format {
+	uint16_t		sxd_type;	/* sxd log item type */
+	uint16_t		sxd_size;	/* size of this item */
+	uint32_t		__pad;
+	uint64_t		sxd_sxi_id;	/* id of corresponding sxi */
+};
+
 /*
  * Dquot Log format definitions.
  *

From patchwork Sun Dec 31 22:28:36 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507883
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id E159BC129
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 22:28:37 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="FGHtX/XE"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 4A772C433C7;
	Sun, 31 Dec 2023 22:28:37 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704061717;
	bh=OXyEpz0e6IodfgWQq1TpeCc0K2jQ0X9BfftbGtSikMQ=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=FGHtX/XEJgOD1xPUOr5pDhBm84ehGql9PEHoPzfdOq/sVp150T9K5nWCn5yU+YFqS
	 i6KhbqB/b5AQ1F3sV4KajOusuXik330488Jnhb+cfJz6KPA96ZrCNPro3q7n46mMVl
	 91Xvcz04yevvh0ay+0dTl3XWKWO7ZR5Zrjv9oy0kFJlGmB4o4kNUgFpfOdbC+AVmEJ
	 EiFiLKIT3ppXWAKnaff8xJAugWj/b+0ozNm0tbLDsRkTN+rgiEIUe7QtdngvEeR8Id
	 j7CbrmRZKLDmR06CXs7FrjWMzZcTXUtBRlYaxrNJCJzrJiilL6hPjWP/YjaHItF9FO
	 QYeO2CXlfu6bw==
Date: Sun, 31 Dec 2023 14:28:36 -0800
Subject: [PATCH 06/20] xfs: create deferred log items for extent swapping
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404996354.1796128.1659527344636167484.stgit@frogsfrogsfrogs>
In-Reply-To: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
References: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Now that we've created the skeleton of a log intent item to track and
restart extent swap operations, add the upper level logic to commit
intent items and turn them into concrete work recorded in the log.  We
use the deferred item "multihop" feature that was introduced a few
patches ago to constrain the number of active swap operations to one per
thread.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_trace.h      |   14 +
 libxfs/Makefile          |    2 
 libxfs/defer_item.c      |   91 ++++
 libxfs/defer_item.h      |    4 
 libxfs/libxfs_priv.h     |   30 +
 libxfs/xfs_bmap.h        |    2 
 libxfs/xfs_defer.c       |    6 
 libxfs/xfs_defer.h       |    2 
 libxfs/xfs_format.h      |    6 
 libxfs/xfs_log_format.h  |   31 +
 libxfs/xfs_swapext.c     | 1028 ++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_swapext.h     |  143 ++++++
 libxfs/xfs_trans_space.h |    4 
 13 files changed, 1359 insertions(+), 4 deletions(-)
 create mode 100644 libxfs/xfs_swapext.c

diff --git a/include/xfs_trace.h b/include/xfs_trace.h
index 57661f36d7c..a8f3ecac7f6 100644
--- a/include/xfs_trace.h
+++ b/include/xfs_trace.h
@@ -329,6 +329,9 @@
 #define trace_xfs_refcount_cow_decrease(...)	((void) 0)
 #define trace_xfs_refcount_recover_extent(...)	((void) 0)
 
+#define trace_xfs_reflink_set_inode_flag(...)	((void) 0)
+#define trace_xfs_reflink_unset_inode_flag(...)	((void) 0)
+
 #define trace_xfs_rmap_find_left_neighbor_candidate(...)	((void) 0)
 #define trace_xfs_rmap_find_left_neighbor_query(...)	((void) 0)
 #define trace_xfs_rmap_find_left_neighbor_result(...)	((void) 0)
@@ -342,6 +345,17 @@
 #define trace_xfs_rmap_map_error(...)		((void) 0)
 #define trace_xfs_rmap_delete_error(...)	((void) 0)
 
+#define trace_xfs_swapext_defer(...)		((void) 0)
+#define trace_xfs_swapext_delta_nextents(...)	((void) 0)
+#define trace_xfs_swapext_delta_nextents_step(...)	((void) 0)
+#define trace_xfs_swapext_extent1_skip(...)	((void) 0)
+#define trace_xfs_swapext_extent1(...)		((void) 0)
+#define trace_xfs_swapext_extent2(...)		((void) 0)
+#define trace_xfs_swapext_final_estimate(...)	((void) 0)
+#define trace_xfs_swapext_initial_estimate(...)	((void) 0)
+#define trace_xfs_swapext_overhead(...)		((void) 0)
+#define trace_xfs_swapext_update_inode_size(...) ((void) 0)
+
 #define trace_xfs_fs_mark_healthy(a,b)		((void) 0)
 
 #define trace_xlog_intent_recovery_failed(...)	((void) 0)
diff --git a/libxfs/Makefile b/libxfs/Makefile
index ed22f5c873e..0fb8f7b39bc 100644
--- a/libxfs/Makefile
+++ b/libxfs/Makefile
@@ -58,6 +58,7 @@ HFILES = \
 	xfs_rtbitmap.h \
 	xfs_sb.h \
 	xfs_shared.h \
+	xfs_swapext.h \
 	xfs_trans_resv.h \
 	xfs_trans_space.h \
 	xfs_dir2_priv.h
@@ -106,6 +107,7 @@ CFILES = cache.c \
 	xfs_rmap_btree.c \
 	xfs_rtbitmap.c \
 	xfs_sb.c \
+	xfs_swapext.c \
 	xfs_symlink_remote.c \
 	xfs_trans_inode.c \
 	xfs_trans_resv.c \
diff --git a/libxfs/defer_item.c b/libxfs/defer_item.c
index e7d64be014d..49e6cf02dc8 100644
--- a/libxfs/defer_item.c
+++ b/libxfs/defer_item.c
@@ -25,6 +25,8 @@
 #include "xfs_attr.h"
 #include "libxfs.h"
 #include "defer_item.h"
+#include "xfs_ag.h"
+#include "xfs_swapext.h"
 
 /* Dummy defer item ops, since we don't do logging. */
 
@@ -677,3 +679,92 @@ const struct xfs_defer_op_type xfs_attr_defer_type = {
 	.finish_item	= xfs_attr_finish_item,
 	.cancel_item	= xfs_attr_cancel_item,
 };
+
+/* Atomic Swapping of File Ranges */
+
+STATIC struct xfs_log_item *
+xfs_swapext_create_intent(
+	struct xfs_trans		*tp,
+	struct list_head		*items,
+	unsigned int			count,
+	bool				sort)
+{
+	return NULL;
+}
+STATIC struct xfs_log_item *
+xfs_swapext_create_done(
+	struct xfs_trans		*tp,
+	struct xfs_log_item		*intent,
+	unsigned int			count)
+{
+	return NULL;
+}
+
+/* Add this deferred SXI to the transaction. */
+void
+xfs_swapext_defer_add(
+	struct xfs_trans		*tp,
+	struct xfs_swapext_intent	*sxi)
+{
+	trace_xfs_swapext_defer(tp->t_mountp, sxi);
+
+	xfs_defer_add(tp, &sxi->sxi_list, &xfs_swapext_defer_type);
+}
+
+static inline struct xfs_swapext_intent *sxi_entry(const struct list_head *e)
+{
+	return list_entry(e, struct xfs_swapext_intent, sxi_list);
+}
+
+/* Process a deferred swapext update. */
+STATIC int
+xfs_swapext_finish_item(
+	struct xfs_trans		*tp,
+	struct xfs_log_item		*done,
+	struct list_head		*item,
+	struct xfs_btree_cur		**state)
+{
+	struct xfs_swapext_intent	*sxi = sxi_entry(item);
+	int				error;
+
+	/*
+	 * Swap one more extent between the two files.  If there's still more
+	 * work to do, we want to requeue ourselves after all other pending
+	 * deferred operations have finished.  This includes all of the dfops
+	 * that we queued directly as well as any new ones created in the
+	 * process of finishing the others.  Doing so prevents us from queuing
+	 * a large number of SXI log items in kernel memory, which in turn
+	 * prevents us from pinning the tail of the log (while logging those
+	 * new SXI items) until the first SXI items can be processed.
+	 */
+	error = xfs_swapext_finish_one(tp, sxi);
+	if (error != -EAGAIN)
+		kmem_cache_free(xfs_swapext_intent_cache, sxi);
+	return error;
+}
+
+/* Abort all pending SXIs. */
+STATIC void
+xfs_swapext_abort_intent(
+	struct xfs_log_item		*intent)
+{
+}
+
+/* Cancel a deferred swapext update. */
+STATIC void
+xfs_swapext_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_swapext_intent	*sxi = sxi_entry(item);
+
+	kmem_cache_free(xfs_swapext_intent_cache, sxi);
+}
+
+const struct xfs_defer_op_type xfs_swapext_defer_type = {
+	.name		= "swapext",
+	.create_intent	= xfs_swapext_create_intent,
+	.abort_intent	= xfs_swapext_abort_intent,
+	.create_done	= xfs_swapext_create_done,
+	.finish_item	= xfs_swapext_finish_item,
+	.cancel_item	= xfs_swapext_cancel_item,
+};
diff --git a/libxfs/defer_item.h b/libxfs/defer_item.h
index 6d3abf1589c..a3ef9e079d0 100644
--- a/libxfs/defer_item.h
+++ b/libxfs/defer_item.h
@@ -10,4 +10,8 @@ struct xfs_bmap_intent;
 
 void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
 
+struct xfs_swapext_intent;
+
+void xfs_swapext_defer_add(struct xfs_trans *tp, struct xfs_swapext_intent *sxi);
+
 #endif /* __LIBXFS_DEFER_ITEM_H_ */
diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h
index 4d9c49091bc..ef29d7e5eb7 100644
--- a/libxfs/libxfs_priv.h
+++ b/libxfs/libxfs_priv.h
@@ -220,6 +220,35 @@ static inline bool WARN_ON(bool expr) {
 	(inode)->i_version = (version);	\
 } while (0)
 
+#define __must_check	__attribute__((__warn_unused_result__))
+
+/*
+ * Allows for effectively applying __must_check to a macro so we can have
+ * both the type-agnostic benefits of the macros while also being able to
+ * enforce that the return value is, in fact, checked.
+ */
+static inline bool __must_check __must_check_overflow(bool overflow)
+{
+	return unlikely(overflow);
+}
+
+/*
+ * For simplicity and code hygiene, the fallback code below insists on
+ * a, b and *d having the same type (similar to the min() and max()
+ * macros), whereas gcc's type-generic overflow checkers accept
+ * different types. Hence we don't just make check_add_overflow an
+ * alias for __builtin_add_overflow, but add type checks similar to
+ * below.
+ */
+#define check_add_overflow(a, b, d) __must_check_overflow(({	\
+	typeof(a) __a = (a);			\
+	typeof(b) __b = (b);			\
+	typeof(d) __d = (d);			\
+	(void) (&__a == &__b);			\
+	(void) (&__a == __d);			\
+	__builtin_add_overflow(__a, __b, __d);	\
+}))
+
 #define min_t(type,x,y) \
 	({ type __x = (x); type __y = (y); __x < __y ? __x: __y; })
 #define max_t(type,x,y) \
@@ -535,6 +564,7 @@ void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *lip, int type,
 #define xfs_log_in_recovery(mp)		(false)
 
 /* xfs_icache.c */
+#define xfs_inode_clear_cowblocks_tag(ip)	do { } while (0)
 #define xfs_inode_set_cowblocks_tag(ip)	do { } while (0)
 #define xfs_inode_set_eofblocks_tag(ip)	do { } while (0)
 
diff --git a/libxfs/xfs_bmap.h b/libxfs/xfs_bmap.h
index 1eee606f392..ccd1ddcd785 100644
--- a/libxfs/xfs_bmap.h
+++ b/libxfs/xfs_bmap.h
@@ -156,7 +156,7 @@ static inline bool xfs_bmap_is_real_extent(const struct xfs_bmbt_irec *irec)
  * Return true if the extent is a real, allocated extent, or false if it is  a
  * delayed allocation, and unwritten extent or a hole.
  */
-static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec)
+static inline bool xfs_bmap_is_written_extent(const struct xfs_bmbt_irec *irec)
 {
 	return xfs_bmap_is_real_extent(irec) &&
 	       irec->br_state != XFS_EXT_UNWRITTEN;
diff --git a/libxfs/xfs_defer.c b/libxfs/xfs_defer.c
index 077e9929807..7782eea458e 100644
--- a/libxfs/xfs_defer.c
+++ b/libxfs/xfs_defer.c
@@ -21,6 +21,7 @@
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_attr.h"
+#include "xfs_swapext.h"
 
 static struct kmem_cache	*xfs_defer_pending_cache;
 
@@ -1174,6 +1175,10 @@ xfs_defer_init_item_caches(void)
 	error = xfs_attr_intent_init_cache();
 	if (error)
 		goto err;
+	error = xfs_swapext_intent_init_cache();
+	if (error)
+		goto err;
+
 	return 0;
 err:
 	xfs_defer_destroy_item_caches();
@@ -1184,6 +1189,7 @@ xfs_defer_init_item_caches(void)
 void
 xfs_defer_destroy_item_caches(void)
 {
+	xfs_swapext_intent_destroy_cache();
 	xfs_attr_intent_destroy_cache();
 	xfs_extfree_intent_destroy_cache();
 	xfs_bmap_intent_destroy_cache();
diff --git a/libxfs/xfs_defer.h b/libxfs/xfs_defer.h
index 18a9fb92dde..e3cf81bafca 100644
--- a/libxfs/xfs_defer.h
+++ b/libxfs/xfs_defer.h
@@ -72,7 +72,7 @@ extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
 extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
 extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
 extern const struct xfs_defer_op_type xfs_attr_defer_type;
-
+extern const struct xfs_defer_op_type xfs_swapext_defer_type;
 
 /*
  * Deferred operation item relogging limits.
diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index c0209bd21db..8b34754a579 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -430,6 +430,12 @@ static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp)
 		 XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
 }
 
+static inline bool xfs_sb_version_haslogswapext(struct xfs_sb *sbp)
+{
+	return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat &
+		 XFS_SB_FEAT_INCOMPAT_LOG_SWAPEXT);
+}
+
 static inline bool
 xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
 {
diff --git a/libxfs/xfs_log_format.h b/libxfs/xfs_log_format.h
index 24c3d5dc361..3341792cf43 100644
--- a/libxfs/xfs_log_format.h
+++ b/libxfs/xfs_log_format.h
@@ -898,9 +898,36 @@ struct xfs_swap_extent {
 	int64_t			sx_isize2;
 };
 
-#define XFS_SWAP_EXT_FLAGS		(0)
+/* Swap extents between extended attribute forks. */
+#define XFS_SWAP_EXT_ATTR_FORK		(1ULL << 0)
 
-#define XFS_SWAP_EXT_STRINGS
+/* Set the file sizes when finished. */
+#define XFS_SWAP_EXT_SET_SIZES		(1ULL << 1)
+
+/*
+ * Swap only the extents of the two files where the file allocation units
+ * mapped to file1's range have been written to.
+ */
+#define XFS_SWAP_EXT_INO1_WRITTEN	(1ULL << 2)
+
+/* Clear the reflink flag from inode1 after the operation. */
+#define XFS_SWAP_EXT_CLEAR_INO1_REFLINK	(1ULL << 3)
+
+/* Clear the reflink flag from inode2 after the operation. */
+#define XFS_SWAP_EXT_CLEAR_INO2_REFLINK	(1ULL << 4)
+
+#define XFS_SWAP_EXT_FLAGS		(XFS_SWAP_EXT_ATTR_FORK | \
+					 XFS_SWAP_EXT_SET_SIZES | \
+					 XFS_SWAP_EXT_INO1_WRITTEN | \
+					 XFS_SWAP_EXT_CLEAR_INO1_REFLINK | \
+					 XFS_SWAP_EXT_CLEAR_INO2_REFLINK)
+
+#define XFS_SWAP_EXT_STRINGS \
+	{ XFS_SWAP_EXT_ATTR_FORK,		"ATTRFORK" }, \
+	{ XFS_SWAP_EXT_SET_SIZES,		"SETSIZES" }, \
+	{ XFS_SWAP_EXT_INO1_WRITTEN,		"INO1_WRITTEN" }, \
+	{ XFS_SWAP_EXT_CLEAR_INO1_REFLINK,	"CLEAR_INO1_REFLINK" }, \
+	{ XFS_SWAP_EXT_CLEAR_INO2_REFLINK,	"CLEAR_INO2_REFLINK" }
 
 /* This is the structure used to lay out an sxi log item in the log. */
 struct xfs_sxi_log_format {
diff --git a/libxfs/xfs_swapext.c b/libxfs/xfs_swapext.c
new file mode 100644
index 00000000000..2462657c1f4
--- /dev/null
+++ b/libxfs/xfs_swapext.c
@@ -0,0 +1,1028 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "libxfs_priv.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_bmap.h"
+#include "xfs_swapext.h"
+#include "xfs_trace.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_quota_defs.h"
+#include "xfs_health.h"
+#include "defer_item.h"
+
+struct kmem_cache	*xfs_swapext_intent_cache;
+
+/* bmbt mappings adjacent to a pair of records. */
+struct xfs_swapext_adjacent {
+	struct xfs_bmbt_irec		left1;
+	struct xfs_bmbt_irec		right1;
+	struct xfs_bmbt_irec		left2;
+	struct xfs_bmbt_irec		right2;
+};
+
+#define ADJACENT_INIT { \
+	.left1  = { .br_startblock = HOLESTARTBLOCK }, \
+	.right1 = { .br_startblock = HOLESTARTBLOCK }, \
+	.left2  = { .br_startblock = HOLESTARTBLOCK }, \
+	.right2 = { .br_startblock = HOLESTARTBLOCK }, \
+}
+
+/* Information to help us reset reflink flag / CoW fork state after a swap. */
+
+/* Previous state of the two inodes' reflink flags. */
+#define XFS_REFLINK_STATE_IP1		(1U << 0)
+#define XFS_REFLINK_STATE_IP2		(1U << 1)
+
+/*
+ * If the reflink flag is set on either inode, make sure it has an incore CoW
+ * fork, since all reflink inodes must have them.  If there's a CoW fork and it
+ * has extents in it, make sure the inodes are tagged appropriately so that
+ * speculative preallocations can be GC'd if we run low on space.
+ */
+static inline void
+xfs_swapext_ensure_cowfork(
+	struct xfs_inode	*ip)
+{
+	struct xfs_ifork	*cfork;
+
+	if (xfs_is_reflink_inode(ip))
+		xfs_ifork_init_cow(ip);
+
+	cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
+	if (!cfork)
+		return;
+	if (cfork->if_bytes > 0)
+		xfs_inode_set_cowblocks_tag(ip);
+	else
+		xfs_inode_clear_cowblocks_tag(ip);
+}
+
+/*
+ * Adjust the on-disk inode size upwards if needed so that we never map extents
+ * into the file past EOF.  This is crucial so that log recovery won't get
+ * confused by the sudden appearance of post-eof extents.
+ */
+STATIC void
+xfs_swapext_update_size(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	struct xfs_bmbt_irec	*imap,
+	xfs_fsize_t		new_isize)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	xfs_fsize_t		len;
+
+	if (new_isize < 0)
+		return;
+
+	len = min(XFS_FSB_TO_B(mp, imap->br_startoff + imap->br_blockcount),
+		  new_isize);
+
+	if (len <= ip->i_disk_size)
+		return;
+
+	trace_xfs_swapext_update_inode_size(ip, len);
+
+	ip->i_disk_size = len;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+static inline bool
+sxi_has_more_swap_work(const struct xfs_swapext_intent *sxi)
+{
+	return sxi->sxi_blockcount > 0;
+}
+
+static inline bool
+sxi_has_postop_work(const struct xfs_swapext_intent *sxi)
+{
+	return sxi->sxi_flags & (XFS_SWAP_EXT_CLEAR_INO1_REFLINK |
+				 XFS_SWAP_EXT_CLEAR_INO2_REFLINK);
+}
+
+static inline void
+sxi_advance(
+	struct xfs_swapext_intent	*sxi,
+	const struct xfs_bmbt_irec	*irec)
+{
+	sxi->sxi_startoff1 += irec->br_blockcount;
+	sxi->sxi_startoff2 += irec->br_blockcount;
+	sxi->sxi_blockcount -= irec->br_blockcount;
+}
+
+/* Check all extents to make sure we can actually swap them. */
+int
+xfs_swapext_check_extents(
+	struct xfs_mount		*mp,
+	const struct xfs_swapext_req	*req)
+{
+	struct xfs_ifork		*ifp1, *ifp2;
+
+	/* No fork? */
+	ifp1 = xfs_ifork_ptr(req->ip1, req->whichfork);
+	ifp2 = xfs_ifork_ptr(req->ip2, req->whichfork);
+	if (!ifp1 || !ifp2)
+		return -EINVAL;
+
+	/* We don't know how to swap local format forks. */
+	if (ifp1->if_format == XFS_DINODE_FMT_LOCAL ||
+	    ifp2->if_format == XFS_DINODE_FMT_LOCAL)
+		return -EINVAL;
+
+	/* We don't support realtime data forks yet. */
+	if (!XFS_IS_REALTIME_INODE(req->ip1))
+		return 0;
+	if (req->whichfork == XFS_ATTR_FORK)
+		return 0;
+	return -EINVAL;
+}
+
+#ifdef CONFIG_XFS_QUOTA
+/* Log the actual updates to the quota accounting. */
+static inline void
+xfs_swapext_update_quota(
+	struct xfs_trans		*tp,
+	struct xfs_swapext_intent	*sxi,
+	struct xfs_bmbt_irec		*irec1,
+	struct xfs_bmbt_irec		*irec2)
+{
+	int64_t				ip1_delta = 0, ip2_delta = 0;
+	unsigned int			qflag;
+
+	qflag = XFS_IS_REALTIME_INODE(sxi->sxi_ip1) ? XFS_TRANS_DQ_RTBCOUNT :
+						      XFS_TRANS_DQ_BCOUNT;
+
+	if (xfs_bmap_is_real_extent(irec1)) {
+		ip1_delta -= irec1->br_blockcount;
+		ip2_delta += irec1->br_blockcount;
+	}
+
+	if (xfs_bmap_is_real_extent(irec2)) {
+		ip1_delta += irec2->br_blockcount;
+		ip2_delta -= irec2->br_blockcount;
+	}
+
+	xfs_trans_mod_dquot_byino(tp, sxi->sxi_ip1, qflag, ip1_delta);
+	xfs_trans_mod_dquot_byino(tp, sxi->sxi_ip2, qflag, ip2_delta);
+}
+#else
+# define xfs_swapext_update_quota(tp, sxi, irec1, irec2)	((void)0)
+#endif
+
+/* Decide if we want to skip this mapping from file1. */
+static inline bool
+xfs_swapext_can_skip_mapping(
+	struct xfs_swapext_intent	*sxi,
+	struct xfs_bmbt_irec		*irec)
+{
+	/* Do not skip this mapping if the caller did not tell us to. */
+	if (!(sxi->sxi_flags & XFS_SWAP_EXT_INO1_WRITTEN))
+		return false;
+
+	/* Do not skip mapped, written extents. */
+	if (xfs_bmap_is_written_extent(irec))
+		return false;
+
+	/*
+	 * The mapping is unwritten or a hole.  It cannot be a delalloc
+	 * reservation because we already excluded those.  It cannot be an
+	 * unwritten extent with dirty page cache because we flushed the page
+	 * cache.  We don't support realtime files yet, so we needn't (yet)
+	 * deal with them.
+	 */
+	return true;
+}
+
+/*
+ * Walk forward through the file ranges in @sxi until we find two different
+ * mappings to exchange.  If there is work to do, return the mappings;
+ * otherwise we've reached the end of the range and sxi_blockcount will be
+ * zero.
+ *
+ * If the walk skips over a pair of mappings to the same storage, save them as
+ * the left records in @adj (if provided) so that the simulation phase can
+ * avoid an extra lookup.
+ */
+static int
+xfs_swapext_find_mappings(
+	struct xfs_swapext_intent	*sxi,
+	struct xfs_bmbt_irec		*irec1,
+	struct xfs_bmbt_irec		*irec2,
+	struct xfs_swapext_adjacent	*adj)
+{
+	int				nimaps;
+	int				bmap_flags;
+	int				error;
+
+	bmap_flags = xfs_bmapi_aflag(xfs_swapext_whichfork(sxi));
+
+	for (; sxi_has_more_swap_work(sxi); sxi_advance(sxi, irec1)) {
+		/* Read extent from the first file */
+		nimaps = 1;
+		error = xfs_bmapi_read(sxi->sxi_ip1, sxi->sxi_startoff1,
+				sxi->sxi_blockcount, irec1, &nimaps,
+				bmap_flags);
+		if (error)
+			return error;
+		if (nimaps != 1 ||
+		    irec1->br_startblock == DELAYSTARTBLOCK ||
+		    irec1->br_startoff != sxi->sxi_startoff1) {
+			/*
+			 * We should never get no mapping or a delalloc extent
+			 * or something that doesn't match what we asked for,
+			 * since the caller flushed both inodes and we hold the
+			 * ILOCKs for both inodes.
+			 */
+			ASSERT(0);
+			return -EINVAL;
+		}
+
+		if (xfs_swapext_can_skip_mapping(sxi, irec1)) {
+			trace_xfs_swapext_extent1_skip(sxi->sxi_ip1, irec1);
+			continue;
+		}
+
+		/* Read extent from the second file */
+		nimaps = 1;
+		error = xfs_bmapi_read(sxi->sxi_ip2, sxi->sxi_startoff2,
+				irec1->br_blockcount, irec2, &nimaps,
+				bmap_flags);
+		if (error)
+			return error;
+		if (nimaps != 1 ||
+		    irec2->br_startblock == DELAYSTARTBLOCK ||
+		    irec2->br_startoff != sxi->sxi_startoff2) {
+			/*
+			 * We should never get no mapping or a delalloc extent
+			 * or something that doesn't match what we asked for,
+			 * since the caller flushed both inodes and we hold the
+			 * ILOCKs for both inodes.
+			 */
+			ASSERT(0);
+			return -EINVAL;
+		}
+
+		/*
+		 * We can only swap as many blocks as the smaller of the two
+		 * extent maps.
+		 */
+		irec1->br_blockcount = min(irec1->br_blockcount,
+					   irec2->br_blockcount);
+
+		trace_xfs_swapext_extent1(sxi->sxi_ip1, irec1);
+		trace_xfs_swapext_extent2(sxi->sxi_ip2, irec2);
+
+		/* We found something to swap, so return it. */
+		if (irec1->br_startblock != irec2->br_startblock)
+			return 0;
+
+		/*
+		 * Two extents mapped to the same physical block must not have
+		 * different states; that's filesystem corruption.  Move on to
+		 * the next extent if they're both holes or both the same
+		 * physical extent.
+		 */
+		if (irec1->br_state != irec2->br_state) {
+			xfs_bmap_mark_sick(sxi->sxi_ip1,
+					xfs_swapext_whichfork(sxi));
+			xfs_bmap_mark_sick(sxi->sxi_ip2,
+					xfs_swapext_whichfork(sxi));
+			return -EFSCORRUPTED;
+		}
+
+		/*
+		 * Save the mappings if we're estimating work and skipping
+		 * these identical mappings.
+		 */
+		if (adj) {
+			memcpy(&adj->left1, irec1, sizeof(*irec1));
+			memcpy(&adj->left2, irec2, sizeof(*irec2));
+		}
+	}
+
+	return 0;
+}
+
+/* Exchange these two mappings. */
+static void
+xfs_swapext_exchange_mappings(
+	struct xfs_trans		*tp,
+	struct xfs_swapext_intent	*sxi,
+	struct xfs_bmbt_irec		*irec1,
+	struct xfs_bmbt_irec		*irec2)
+{
+	int				whichfork = xfs_swapext_whichfork(sxi);
+
+	xfs_swapext_update_quota(tp, sxi, irec1, irec2);
+
+	/* Remove both mappings. */
+	xfs_bmap_unmap_extent(tp, sxi->sxi_ip1, whichfork, irec1);
+	xfs_bmap_unmap_extent(tp, sxi->sxi_ip2, whichfork, irec2);
+
+	/*
+	 * Re-add both mappings.  We swap the file offsets between the two maps
+	 * and add the opposite map, which has the effect of filling the
+	 * logical offsets we just unmapped, but with the physical mapping
+	 * information swapped.
+	 */
+	swap(irec1->br_startoff, irec2->br_startoff);
+	xfs_bmap_map_extent(tp, sxi->sxi_ip1, whichfork, irec2);
+	xfs_bmap_map_extent(tp, sxi->sxi_ip2, whichfork, irec1);
+
+	/* Make sure we're not mapping extents past EOF. */
+	if (whichfork == XFS_DATA_FORK) {
+		xfs_swapext_update_size(tp, sxi->sxi_ip1, irec2,
+				sxi->sxi_isize1);
+		xfs_swapext_update_size(tp, sxi->sxi_ip2, irec1,
+				sxi->sxi_isize2);
+	}
+
+	/*
+	 * Advance our cursor and exit.   The caller (either defer ops or log
+	 * recovery) will log the SXD item, and if sxi_blockcount is nonzero, it
+	 * will log a new SXI item for the remainder and call us back.
+	 */
+	sxi_advance(sxi, irec1);
+}
+
+static inline void
+xfs_swapext_clear_reflink(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
+{
+	trace_xfs_reflink_unset_inode_flag(ip);
+
+	ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/* Finish whatever work might come after a swap operation. */
+static int
+xfs_swapext_do_postop_work(
+	struct xfs_trans		*tp,
+	struct xfs_swapext_intent	*sxi)
+{
+	if (sxi->sxi_flags & XFS_SWAP_EXT_CLEAR_INO1_REFLINK) {
+		xfs_swapext_clear_reflink(tp, sxi->sxi_ip1);
+		sxi->sxi_flags &= ~XFS_SWAP_EXT_CLEAR_INO1_REFLINK;
+	}
+
+	if (sxi->sxi_flags & XFS_SWAP_EXT_CLEAR_INO2_REFLINK) {
+		xfs_swapext_clear_reflink(tp, sxi->sxi_ip2);
+		sxi->sxi_flags &= ~XFS_SWAP_EXT_CLEAR_INO2_REFLINK;
+	}
+
+	return 0;
+}
+
+/* Finish one extent swap, possibly log more. */
+int
+xfs_swapext_finish_one(
+	struct xfs_trans		*tp,
+	struct xfs_swapext_intent	*sxi)
+{
+	struct xfs_bmbt_irec		irec1, irec2;
+	int				error;
+
+	if (sxi_has_more_swap_work(sxi)) {
+		/*
+		 * If the operation state says that some range of the files
+		 * have not yet been swapped, look for extents in that range to
+		 * swap.  If we find some extents, swap them.
+		 */
+		error = xfs_swapext_find_mappings(sxi, &irec1, &irec2, NULL);
+		if (error)
+			return error;
+
+		if (sxi_has_more_swap_work(sxi))
+			xfs_swapext_exchange_mappings(tp, sxi, &irec1, &irec2);
+
+		/*
+		 * If the caller asked us to exchange the file sizes after the
+		 * swap and either we just swapped the last extents in the
+		 * range or we didn't find anything to swap, update the ondisk
+		 * file sizes.
+		 */
+		if ((sxi->sxi_flags & XFS_SWAP_EXT_SET_SIZES) &&
+		    !sxi_has_more_swap_work(sxi)) {
+			sxi->sxi_ip1->i_disk_size = sxi->sxi_isize1;
+			sxi->sxi_ip2->i_disk_size = sxi->sxi_isize2;
+
+			xfs_trans_log_inode(tp, sxi->sxi_ip1, XFS_ILOG_CORE);
+			xfs_trans_log_inode(tp, sxi->sxi_ip2, XFS_ILOG_CORE);
+		}
+	} else if (sxi_has_postop_work(sxi)) {
+		/*
+		 * Now that we're finished with the swap operation, complete
+		 * the post-op cleanup work.
+		 */
+		error = xfs_swapext_do_postop_work(tp, sxi);
+		if (error)
+			return error;
+	}
+
+	/* If we still have work to do, ask for a new transaction. */
+	if (sxi_has_more_swap_work(sxi) || sxi_has_postop_work(sxi)) {
+		trace_xfs_swapext_defer(tp->t_mountp, sxi);
+		return -EAGAIN;
+	}
+
+	/*
+	 * If we reach here, we've finished all the swapping work and the post
+	 * operation work.  The last thing we need to do before returning to
+	 * the caller is to make sure that COW forks are set up correctly.
+	 */
+	if (!(sxi->sxi_flags & XFS_SWAP_EXT_ATTR_FORK)) {
+		xfs_swapext_ensure_cowfork(sxi->sxi_ip1);
+		xfs_swapext_ensure_cowfork(sxi->sxi_ip2);
+	}
+
+	return 0;
+}
+
+/*
+ * Compute the amount of bmbt blocks we should reserve for each file.  In the
+ * worst case, each exchange will fill a hole with a new mapping, which could
+ * result in a btree split every time we add a new leaf block.
+ */
+static inline uint64_t
+xfs_swapext_bmbt_blocks(
+	struct xfs_mount		*mp,
+	const struct xfs_swapext_req	*req)
+{
+	return howmany_64(req->nr_exchanges,
+					XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp)) *
+			XFS_EXTENTADD_SPACE_RES(mp, req->whichfork);
+}
+
+static inline uint64_t
+xfs_swapext_rmapbt_blocks(
+	struct xfs_mount		*mp,
+	const struct xfs_swapext_req	*req)
+{
+	if (!xfs_has_rmapbt(mp))
+		return 0;
+	if (XFS_IS_REALTIME_INODE(req->ip1))
+		return 0;
+
+	return howmany_64(req->nr_exchanges,
+					XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) *
+			XFS_RMAPADD_SPACE_RES(mp);
+}
+
+/* Estimate the bmbt and rmapbt overhead required to exchange extents. */
+static int
+xfs_swapext_estimate_overhead(
+	struct xfs_swapext_req	*req)
+{
+	struct xfs_mount	*mp = req->ip1->i_mount;
+	xfs_filblks_t		bmbt_blocks;
+	xfs_filblks_t		rmapbt_blocks;
+	xfs_filblks_t		resblks = req->resblks;
+
+	/*
+	 * Compute the number of bmbt and rmapbt blocks we might need to handle
+	 * the estimated number of exchanges.
+	 */
+	bmbt_blocks = xfs_swapext_bmbt_blocks(mp, req);
+	rmapbt_blocks = xfs_swapext_rmapbt_blocks(mp, req);
+
+	trace_xfs_swapext_overhead(mp, bmbt_blocks, rmapbt_blocks);
+
+	/* Make sure the change in file block count doesn't overflow. */
+	if (check_add_overflow(req->ip1_bcount, bmbt_blocks, &req->ip1_bcount))
+		return -EFBIG;
+	if (check_add_overflow(req->ip2_bcount, bmbt_blocks, &req->ip2_bcount))
+		return -EFBIG;
+
+	/*
+	 * Add the blocks needed to handle btree growth to the transaction
+	 * reservation.  Each overhead is added twice, presumably once for
+	 * each of the two files involved in the exchange.
+	 */
+	if (check_add_overflow(resblks, bmbt_blocks, &resblks))
+		return -ENOSPC;
+	if (check_add_overflow(resblks, bmbt_blocks, &resblks))
+		return -ENOSPC;
+	if (check_add_overflow(resblks, rmapbt_blocks, &resblks))
+		return -ENOSPC;
+	if (check_add_overflow(resblks, rmapbt_blocks, &resblks))
+		return -ENOSPC;
+
+	/* Can't actually reserve more than UINT_MAX blocks. */
+	if (resblks > UINT_MAX)
+		return -ENOSPC;
+
+	req->resblks = resblks;
+	trace_xfs_swapext_final_estimate(req);
+	return 0;
+}
+
+/* Decide if we can merge two real extents. */
+static inline bool
+can_merge(
+	const struct xfs_bmbt_irec	*b1,
+	const struct xfs_bmbt_irec	*b2)
+{
+	/* Don't merge holes. */
+	if (b1->br_startblock == HOLESTARTBLOCK ||
+	    b2->br_startblock == HOLESTARTBLOCK)
+		return false;
+
+	/* Only real extents can be merged. */
+	if (!xfs_bmap_is_real_extent(b1) || !xfs_bmap_is_real_extent(b2))
+		return false;
+
+	if (b1->br_startoff   + b1->br_blockcount == b2->br_startoff &&
+	    b1->br_startblock + b1->br_blockcount == b2->br_startblock &&
+	    b1->br_state			  == b2->br_state &&
+	    b1->br_blockcount + b2->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+		return true;
+
+	return false;
+}
+
+#define CLEFT_CONTIG	0x01
+#define CRIGHT_CONTIG	0x02
+#define CHOLE		0x04
+#define CBOTH_CONTIG	(CLEFT_CONTIG | CRIGHT_CONTIG)
+
+#define NLEFT_CONTIG	0x10
+#define NRIGHT_CONTIG	0x20
+#define NHOLE		0x40
+#define NBOTH_CONTIG	(NLEFT_CONTIG | NRIGHT_CONTIG)
+
+/* Estimate the effect of a single swap on extent count. */
+static inline int
+delta_nextents_step(
+	struct xfs_mount		*mp,
+	const struct xfs_bmbt_irec	*left,
+	const struct xfs_bmbt_irec	*curr,
+	const struct xfs_bmbt_irec	*new,
+	const struct xfs_bmbt_irec	*right)
+{
+	bool				lhole, rhole, chole, nhole;
+	unsigned int			state = 0;
+	int				ret = 0;
+
+	lhole = left->br_startblock == HOLESTARTBLOCK;
+	rhole = right->br_startblock == HOLESTARTBLOCK;
+	chole = curr->br_startblock == HOLESTARTBLOCK;
+	nhole = new->br_startblock == HOLESTARTBLOCK;
+
+	if (chole)
+		state |= CHOLE;
+	if (!lhole && !chole && can_merge(left, curr))
+		state |= CLEFT_CONTIG;
+	if (!rhole && !chole && can_merge(curr, right))
+		state |= CRIGHT_CONTIG;
+	if ((state & CBOTH_CONTIG) == CBOTH_CONTIG &&
+	    left->br_blockcount + curr->br_blockcount +
+					right->br_blockcount > XFS_MAX_BMBT_EXTLEN)
+		state &= ~CRIGHT_CONTIG;
+
+	if (nhole)
+		state |= NHOLE;
+	if (!lhole && !nhole && can_merge(left, new))
+		state |= NLEFT_CONTIG;
+	if (!rhole && !nhole && can_merge(new, right))
+		state |= NRIGHT_CONTIG;
+	if ((state & NBOTH_CONTIG) == NBOTH_CONTIG &&
+	    left->br_blockcount + new->br_blockcount +
+					right->br_blockcount > XFS_MAX_BMBT_EXTLEN)
+		state &= ~NRIGHT_CONTIG;
+
+	switch (state & (CLEFT_CONTIG | CRIGHT_CONTIG | CHOLE)) {
+	case CLEFT_CONTIG | CRIGHT_CONTIG:
+		/*
+		 * left/curr/right are the same extent, so deleting curr causes
+		 * 2 new extents to be created.
+		 */
+		ret += 2;
+		break;
+	case 0:
+		/*
+		 * curr is not contiguous with any extent, so we remove curr
+		 * completely
+		 */
+		ret--;
+		break;
+	case CHOLE:
+		/* hole, do nothing */
+		break;
+	case CLEFT_CONTIG:
+	case CRIGHT_CONTIG:
+		/* trim either left or right, no change */
+		break;
+	}
+
+	switch (state & (NLEFT_CONTIG | NRIGHT_CONTIG | NHOLE)) {
+	case NLEFT_CONTIG | NRIGHT_CONTIG:
+		/*
+		 * left/new/right will become the same extent, so adding
+		 * new causes the deletion of right.
+		 */
+		ret--;
+		break;
+	case 0:
+		/* new is not contiguous with any extent */
+		ret++;
+		break;
+	case NHOLE:
+		/* hole, do nothing. */
+		break;
+	case NLEFT_CONTIG:
+	case NRIGHT_CONTIG:
+		/* new is absorbed into left or right, no change */
+		break;
+	}
+
+	trace_xfs_swapext_delta_nextents_step(mp, left, curr, new, right, ret,
+			state);
+	return ret;
+}
+
+/* Make sure we don't overflow the extent counters. */
+static inline int
+ensure_delta_nextents(
+	struct xfs_swapext_req	*req,
+	struct xfs_inode	*ip,
+	int64_t			delta)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, req->whichfork);
+	xfs_extnum_t		max_extents;
+	bool			large_extcount;
+
+	if (delta < 0)
+		return 0;
+
+	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) {
+		if (ifp->if_nextents + delta > 10)
+			return -EFBIG;
+	}
+
+	if (req->req_flags & XFS_SWAP_REQ_NREXT64)
+		large_extcount = true;
+	else
+		large_extcount = xfs_inode_has_large_extent_counts(ip);
+
+	max_extents = xfs_iext_max_nextents(large_extcount, req->whichfork);
+	if (ifp->if_nextents + delta <= max_extents)
+		return 0;
+	if (large_extcount)
+		return -EFBIG;
+	if (!xfs_has_large_extent_counts(mp))
+		return -EFBIG;
+
+	max_extents = xfs_iext_max_nextents(true, req->whichfork);
+	if (ifp->if_nextents + delta > max_extents)
+		return -EFBIG;
+
+	req->req_flags |= XFS_SWAP_REQ_NREXT64;
+	return 0;
+}
+
+/* Find the next extent after irec. */
+static inline int
+get_next_ext(
+	struct xfs_inode		*ip,
+	int				bmap_flags,
+	const struct xfs_bmbt_irec	*irec,
+	struct xfs_bmbt_irec		*nrec)
+{
+	xfs_fileoff_t			off;
+	xfs_filblks_t			blockcount;
+	int				nimaps = 1;
+	int				error;
+
+	off = irec->br_startoff + irec->br_blockcount;
+	blockcount = XFS_MAX_FILEOFF - off;
+	error = xfs_bmapi_read(ip, off, blockcount, nrec, &nimaps, bmap_flags);
+	if (error)
+		return error;
+	if (nrec->br_startblock == DELAYSTARTBLOCK ||
+	    nrec->br_startoff != off) {
+		/*
+		 * If we don't get the extent we want, flag the mapping as a
+		 * hole so that our estimator function will treat it as one.
+		 * We shouldn't get delalloc reservations.
+		 */
+		nrec->br_startblock = HOLESTARTBLOCK;
+	}
+
+	return 0;
+}
+
+int __init
+xfs_swapext_intent_init_cache(void)
+{
+	xfs_swapext_intent_cache = kmem_cache_create("xfs_swapext_intent",
+			sizeof(struct xfs_swapext_intent),
+			0, 0, NULL);
+
+	return xfs_swapext_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_swapext_intent_destroy_cache(void)
+{
+	kmem_cache_destroy(xfs_swapext_intent_cache);
+	xfs_swapext_intent_cache = NULL;
+}
+
+/*
+ * Decide if we will swap the reflink flags between the two files after the
+ * swap.  The only time we want to do this is if we're exchanging all extents
+ * under EOF and the inode reflink flags have different states.
+ */
+static inline bool
+sxi_can_exchange_reflink_flags(
+	const struct xfs_swapext_req	*req,
+	unsigned int			reflink_state)
+{
+	struct xfs_mount		*mp = req->ip1->i_mount;
+
+	if (hweight32(reflink_state) != 1)
+		return false;
+	if (req->startoff1 != 0 || req->startoff2 != 0)
+		return false;
+	if (req->blockcount != XFS_B_TO_FSB(mp, req->ip1->i_disk_size))
+		return false;
+	if (req->blockcount != XFS_B_TO_FSB(mp, req->ip2->i_disk_size))
+		return false;
+	return true;
+}
+
+
+/* Allocate and initialize a new incore intent item from a request. */
+struct xfs_swapext_intent *
+xfs_swapext_init_intent(
+	const struct xfs_swapext_req	*req,
+	unsigned int			*reflink_state)
+{
+	struct xfs_swapext_intent	*sxi;
+	unsigned int			rs = 0;
+
+	sxi = kmem_cache_zalloc(xfs_swapext_intent_cache,
+			GFP_NOFS | __GFP_NOFAIL);
+	INIT_LIST_HEAD(&sxi->sxi_list);
+	sxi->sxi_ip1 = req->ip1;
+	sxi->sxi_ip2 = req->ip2;
+	sxi->sxi_startoff1 = req->startoff1;
+	sxi->sxi_startoff2 = req->startoff2;
+	sxi->sxi_blockcount = req->blockcount;
+	sxi->sxi_isize1 = sxi->sxi_isize2 = -1;
+
+	if (req->whichfork == XFS_ATTR_FORK)
+		sxi->sxi_flags |= XFS_SWAP_EXT_ATTR_FORK;
+
+	if (req->whichfork == XFS_DATA_FORK &&
+	    (req->req_flags & XFS_SWAP_REQ_SET_SIZES)) {
+		sxi->sxi_flags |= XFS_SWAP_EXT_SET_SIZES;
+		sxi->sxi_isize1 = req->ip2->i_disk_size;
+		sxi->sxi_isize2 = req->ip1->i_disk_size;
+	}
+
+	if (req->req_flags & XFS_SWAP_REQ_INO1_WRITTEN)
+		sxi->sxi_flags |= XFS_SWAP_EXT_INO1_WRITTEN;
+
+	if (req->req_flags & XFS_SWAP_REQ_LOGGED)
+		sxi->sxi_op_flags |= XFS_SWAP_EXT_OP_LOGGED;
+	if (req->req_flags & XFS_SWAP_REQ_NREXT64)
+		sxi->sxi_op_flags |= XFS_SWAP_EXT_OP_NREXT64;
+
+	if (req->whichfork == XFS_DATA_FORK) {
+		/*
+		 * Record the state of each inode's reflink flag before the
+		 * operation.
+		 */
+		if (xfs_is_reflink_inode(req->ip1))
+			rs |= XFS_REFLINK_STATE_IP1;
+		if (xfs_is_reflink_inode(req->ip2))
+			rs |= XFS_REFLINK_STATE_IP2;
+
+		/*
+		 * Figure out if we're clearing the reflink flags (which
+		 * effectively swaps them) after the operation.
+		 */
+		if (sxi_can_exchange_reflink_flags(req, rs)) {
+			if (rs & XFS_REFLINK_STATE_IP1)
+				sxi->sxi_flags |=
+						XFS_SWAP_EXT_CLEAR_INO1_REFLINK;
+			if (rs & XFS_REFLINK_STATE_IP2)
+				sxi->sxi_flags |=
+						XFS_SWAP_EXT_CLEAR_INO2_REFLINK;
+		}
+	}
+
+	if (reflink_state)
+		*reflink_state = rs;
+	return sxi;
+}
+
+/*
+ * Estimate the number of exchange operations and the number of file blocks
+ * in each file that will be affected by the exchange operation.
+ */
+int
+xfs_swapext_estimate(
+	struct xfs_swapext_req		*req)
+{
+	struct xfs_swapext_intent	*sxi;
+	struct xfs_bmbt_irec		irec1, irec2;
+	struct xfs_swapext_adjacent	adj = ADJACENT_INIT;
+	xfs_filblks_t			ip1_blocks = 0, ip2_blocks = 0;
+	int64_t				d_nexts1, d_nexts2;
+	int				bmap_flags;
+	int				error;
+
+	ASSERT(!(req->req_flags & ~XFS_SWAP_REQ_FLAGS));
+
+	bmap_flags = xfs_bmapi_aflag(req->whichfork);
+	sxi = xfs_swapext_init_intent(req, NULL);
+
+	/*
+	 * To guard against the possibility of overflowing the extent counters,
+	 * we have to estimate an upper bound on the potential increase in that
+	 * counter.  We can split the extent at each end of the range, and for
+	 * each step of the swap we can split the extent that we're working on
+	 * if the extents do not align.
+	 */
+	d_nexts1 = d_nexts2 = 3;
+
+	while (sxi_has_more_swap_work(sxi)) {
+		/*
+		 * Walk through the file ranges until we find something to
+		 * swap.  Because we're simulating the swap, pass in adj to
+		 * capture skipped mappings for correct estimation of bmbt
+		 * record merges.
+		 */
+		error = xfs_swapext_find_mappings(sxi, &irec1, &irec2, &adj);
+		if (error)
+			goto out_free;
+		if (!sxi_has_more_swap_work(sxi))
+			break;
+
+		/* Update accounting. */
+		if (xfs_bmap_is_real_extent(&irec1))
+			ip1_blocks += irec1.br_blockcount;
+		if (xfs_bmap_is_real_extent(&irec2))
+			ip2_blocks += irec2.br_blockcount;
+		req->nr_exchanges++;
+
+		/* Read the next extents from both files. */
+		error = get_next_ext(req->ip1, bmap_flags, &irec1, &adj.right1);
+		if (error)
+			goto out_free;
+
+		error = get_next_ext(req->ip2, bmap_flags, &irec2, &adj.right2);
+		if (error)
+			goto out_free;
+
+		/* Update extent count deltas. */
+		d_nexts1 += delta_nextents_step(req->ip1->i_mount,
+				&adj.left1, &irec1, &irec2, &adj.right1);
+
+		d_nexts2 += delta_nextents_step(req->ip1->i_mount,
+				&adj.left2, &irec2, &irec1, &adj.right2);
+
+		/* Now pretend we swapped the extents. */
+		if (can_merge(&adj.left2, &irec1))
+			adj.left2.br_blockcount += irec1.br_blockcount;
+		else
+			memcpy(&adj.left2, &irec1, sizeof(irec1));
+
+		if (can_merge(&adj.left1, &irec2))
+			adj.left1.br_blockcount += irec2.br_blockcount;
+		else
+			memcpy(&adj.left1, &irec2, sizeof(irec2));
+
+		sxi_advance(sxi, &irec1);
+	}
+
+	/* Account for the blocks that are being exchanged. */
+	if (XFS_IS_REALTIME_INODE(req->ip1) &&
+	    req->whichfork == XFS_DATA_FORK) {
+		req->ip1_rtbcount = ip1_blocks;
+		req->ip2_rtbcount = ip2_blocks;
+	} else {
+		req->ip1_bcount = ip1_blocks;
+		req->ip2_bcount = ip2_blocks;
+	}
+
+	/*
+	 * Make sure that both forks have enough slack left in their extent
+	 * counters that the swap operation will not overflow.
+	 */
+	trace_xfs_swapext_delta_nextents(req, d_nexts1, d_nexts2);
+	if (req->ip1 == req->ip2) {
+		error = ensure_delta_nextents(req, req->ip1,
+				d_nexts1 + d_nexts2);
+	} else {
+		error = ensure_delta_nextents(req, req->ip1, d_nexts1);
+		if (error)
+			goto out_free;
+		error = ensure_delta_nextents(req, req->ip2, d_nexts2);
+	}
+	if (error)
+		goto out_free;
+
+	trace_xfs_swapext_initial_estimate(req);
+	error = xfs_swapext_estimate_overhead(req);
+out_free:
+	kmem_cache_free(xfs_swapext_intent_cache, sxi);
+	return error;
+}
+
+static inline void
+xfs_swapext_set_reflink(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
+{
+	trace_xfs_reflink_set_inode_flag(ip);
+
+	ip->i_diflags2 |= XFS_DIFLAG2_REFLINK;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/*
+ * If either file has shared blocks and we're swapping data forks, we must flag
+ * the other file as having shared blocks so that we get the shared-block rmap
+ * functions if we need to fix up the rmaps.
+ */
+void
+xfs_swapext_ensure_reflink(
+	struct xfs_trans		*tp,
+	const struct xfs_swapext_intent	*sxi,
+	unsigned int			reflink_state)
+{
+	if ((reflink_state & XFS_REFLINK_STATE_IP1) &&
+	    !xfs_is_reflink_inode(sxi->sxi_ip2))
+		xfs_swapext_set_reflink(tp, sxi->sxi_ip2);
+
+	if ((reflink_state & XFS_REFLINK_STATE_IP2) &&
+	    !xfs_is_reflink_inode(sxi->sxi_ip1))
+		xfs_swapext_set_reflink(tp, sxi->sxi_ip1);
+}
+
+/* Widen the extent counts of both inodes if necessary. */
+static inline void
+xfs_swapext_upgrade_extent_counts(
+	struct xfs_trans		*tp,
+	const struct xfs_swapext_intent	*sxi)
+{
+	if (!(sxi->sxi_op_flags & XFS_SWAP_EXT_OP_NREXT64))
+		return;
+
+	sxi->sxi_ip1->i_diflags2 |= XFS_DIFLAG2_NREXT64;
+	xfs_trans_log_inode(tp, sxi->sxi_ip1, XFS_ILOG_CORE);
+
+	sxi->sxi_ip2->i_diflags2 |= XFS_DIFLAG2_NREXT64;
+	xfs_trans_log_inode(tp, sxi->sxi_ip2, XFS_ILOG_CORE);
+}
+
+/*
+ * Schedule a swap of a range of extents from one inode to another.  If the
+ * atomic swap feature is enabled, then the operation progress can be resumed
+ * even if the system goes down.  The caller must commit the transaction to
+ * start the work.
+ *
+ * The caller must ensure that the inodes are joined to the transaction and
+ * ILOCKd; they will still be joined to the transaction at exit.
+ */
+void
+xfs_swapext(
+	struct xfs_trans		*tp,
+	const struct xfs_swapext_req	*req)
+{
+	struct xfs_swapext_intent	*sxi;
+	unsigned int			reflink_state;
+
+	ASSERT(xfs_isilocked(req->ip1, XFS_ILOCK_EXCL));
+	ASSERT(xfs_isilocked(req->ip2, XFS_ILOCK_EXCL));
+	ASSERT(req->whichfork != XFS_COW_FORK);
+	ASSERT(!(req->req_flags & ~XFS_SWAP_REQ_FLAGS));
+	if (req->req_flags & XFS_SWAP_REQ_SET_SIZES)
+		ASSERT(req->whichfork == XFS_DATA_FORK);
+
+	if (req->blockcount == 0)
+		return;
+
+	sxi = xfs_swapext_init_intent(req, &reflink_state);
+	xfs_swapext_defer_add(tp, sxi);
+	xfs_swapext_ensure_reflink(tp, sxi, reflink_state);
+	xfs_swapext_upgrade_extent_counts(tp, sxi);
+}
diff --git a/libxfs/xfs_swapext.h b/libxfs/xfs_swapext.h
index 01bb3271f64..fa786bc9352 100644
--- a/libxfs/xfs_swapext.h
+++ b/libxfs/xfs_swapext.h
@@ -72,4 +72,147 @@ xfs_atomic_swap_supported(
 	return false;
 }
 
+/*
+ * In-core information about an extent swap request between ranges of two
+ * inodes.
+ */
+struct xfs_swapext_intent {
+	/* List of other incore deferred work. */
+	struct list_head	sxi_list;
+
+	/* Inodes participating in the operation. */
+	struct xfs_inode	*sxi_ip1;
+	struct xfs_inode	*sxi_ip2;
+
+	/* File offset range information. */
+	xfs_fileoff_t		sxi_startoff1;
+	xfs_fileoff_t		sxi_startoff2;
+	xfs_filblks_t		sxi_blockcount;
+
+	/* Set these file sizes after the operation, unless negative. */
+	xfs_fsize_t		sxi_isize1;
+	xfs_fsize_t		sxi_isize2;
+
+	/* XFS_SWAP_EXT_* log operation flags */
+	unsigned int		sxi_flags;
+
+	/* XFS_SWAP_EXT_OP_* flags */
+	unsigned int		sxi_op_flags;
+};
+
+/* Use log intent items to track and restart the entire operation. */
+#define XFS_SWAP_EXT_OP_LOGGED	(1U << 0)
+
+/* Upgrade files to have large extent counts before proceeding. */
+#define XFS_SWAP_EXT_OP_NREXT64	(1U << 1)
+
+#define XFS_SWAP_EXT_OP_STRINGS \
+	{ XFS_SWAP_EXT_OP_LOGGED,		"LOGGED" }, \
+	{ XFS_SWAP_EXT_OP_NREXT64,		"NREXT64" }
+
+static inline int
+xfs_swapext_whichfork(const struct xfs_swapext_intent *sxi)
+{
+	if (sxi->sxi_flags & XFS_SWAP_EXT_ATTR_FORK)
+		return XFS_ATTR_FORK;
+	return XFS_DATA_FORK;
+}
+
+/* Parameters for a swapext request. */
+struct xfs_swapext_req {
+	/* Inodes participating in the operation. */
+	struct xfs_inode	*ip1;
+	struct xfs_inode	*ip2;
+
+	/* File offset range information. */
+	xfs_fileoff_t		startoff1;
+	xfs_fileoff_t		startoff2;
+	xfs_filblks_t		blockcount;
+
+	/* Data or attr fork? */
+	int			whichfork;
+
+	/* XFS_SWAP_REQ_* operation flags */
+	unsigned int		req_flags;
+
+	/*
+	 * Fields below this line are filled out by xfs_swapext_estimate;
+	 * callers should initialize this part of the struct to zero.
+	 */
+
+	/*
+	 * Data device blocks to be moved out of ip1, and free space needed to
+	 * handle the bmbt changes.
+	 */
+	xfs_filblks_t		ip1_bcount;
+
+	/*
+	 * Data device blocks to be moved out of ip2, and free space needed to
+	 * handle the bmbt changes.
+	 */
+	xfs_filblks_t		ip2_bcount;
+
+	/* rt blocks to be moved out of ip1. */
+	xfs_filblks_t		ip1_rtbcount;
+
+	/* rt blocks to be moved out of ip2. */
+	xfs_filblks_t		ip2_rtbcount;
+
+	/* Free space needed to handle the bmbt changes */
+	unsigned long long	resblks;
+
+	/* Number of extent swaps needed to complete the operation */
+	unsigned long long	nr_exchanges;
+};
+
+/* Caller has permission to use log intent items for the swapext operation. */
+#define XFS_SWAP_REQ_LOGGED		(1U << 0)
+
+/* Set the file sizes when finished. */
+#define XFS_SWAP_REQ_SET_SIZES		(1U << 1)
+
+/*
+ * Swap only the parts of the two files where the file allocation units
+ * mapped to file1's range have been written to.
+ */
+#define XFS_SWAP_REQ_INO1_WRITTEN	(1U << 2)
+
+/* Files need to be upgraded to have large extent counts. */
+#define XFS_SWAP_REQ_NREXT64		(1U << 3)
+
+#define XFS_SWAP_REQ_FLAGS		(XFS_SWAP_REQ_LOGGED | \
+					 XFS_SWAP_REQ_SET_SIZES | \
+					 XFS_SWAP_REQ_INO1_WRITTEN | \
+					 XFS_SWAP_REQ_NREXT64)
+
+#define XFS_SWAP_REQ_STRINGS \
+	{ XFS_SWAP_REQ_LOGGED,		"LOGGED" }, \
+	{ XFS_SWAP_REQ_SET_SIZES,	"SETSIZES" }, \
+	{ XFS_SWAP_REQ_INO1_WRITTEN,	"INO1_WRITTEN" }, \
+	{ XFS_SWAP_REQ_NREXT64,		"NREXT64" }
+
+unsigned int xfs_swapext_reflink_prep(const struct xfs_swapext_req *req);
+void xfs_swapext_reflink_finish(struct xfs_trans *tp,
+		const struct xfs_swapext_req *req, unsigned int reflink_state);
+
+int xfs_swapext_estimate(struct xfs_swapext_req *req);
+
+extern struct kmem_cache	*xfs_swapext_intent_cache;
+
+int __init xfs_swapext_intent_init_cache(void);
+void xfs_swapext_intent_destroy_cache(void);
+
+struct xfs_swapext_intent *xfs_swapext_init_intent(
+		const struct xfs_swapext_req *req, unsigned int *reflink_state);
+void xfs_swapext_ensure_reflink(struct xfs_trans *tp,
+		const struct xfs_swapext_intent *sxi, unsigned int reflink_state);
+
+int xfs_swapext_finish_one(struct xfs_trans *tp,
+		struct xfs_swapext_intent *sxi);
+
+int xfs_swapext_check_extents(struct xfs_mount *mp,
+		const struct xfs_swapext_req *req);
+
+void xfs_swapext(struct xfs_trans *tp, const struct xfs_swapext_req *req);
+
 #endif /* __XFS_SWAPEXT_H_ */
diff --git a/libxfs/xfs_trans_space.h b/libxfs/xfs_trans_space.h
index 87b31c69a77..9640fc232c1 100644
--- a/libxfs/xfs_trans_space.h
+++ b/libxfs/xfs_trans_space.h
@@ -10,6 +10,10 @@
  * Components of space reservations.
  */
 
+/* Worst case number of bmaps that can be held in a block. */
+#define XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp)    \
+		(((mp)->m_bmap_dmxr[0]) - ((mp)->m_bmap_dmnr[0]))
+
 /* Worst case number of rmaps that can be held in a block. */
 #define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)    \
 		(((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))

From patchwork Sun Dec 31 22:28:52 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507884
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 67F80C12D
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 22:28:53 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="l6Z6Vwj6"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id EF6E5C433C8;
	Sun, 31 Dec 2023 22:28:52 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704061733;
	bh=tsvnwEVO/QXbM70MZomi3i2wTBN/R5pxNAGsM0x+awg=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=l6Z6Vwj6r+J3aJ8k5DedddjNBacG6SNjaZD/AVZZV0UXIzqt4zvXnCcAmzpE8KhTM
	 +dsmgHloVVRbziAriMMrL5UUm/K06mu7gDddehVYwKg3I5B4roCxj79xMLCZahWRYK
	 wQVJb3ymm8OutPPvYeJbgLWPc6Sisrm3g+dbn2i6EQQMvjvcI1bDijMik4Qou5jZNk
	 1KYsxSXMDs0Fi3kkebnA+vg4QM9RhXk/e6uoObEOglDQYR1SpHcK/eR4P1b6c5Saiu
	 fYjSpgtpxsolrovp+fs8cfFwgOKYzWvv3zOscpF3gEuTVYkBL9PltG82pcYV4dSXyY
	 N+AIxHZa42NDA==
Date: Sun, 31 Dec 2023 14:28:52 -0800
Subject: [PATCH 07/20] xfs: add error injection to test swapext recovery
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404996368.1796128.5472649324233917879.stgit@frogsfrogsfrogs>
In-Reply-To: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
References: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Add an errortag so that we can test recovery of swapext log items.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 io/inject.c           |    1 +
 libxfs/xfs_errortag.h |    4 +++-
 libxfs/xfs_swapext.c  |    4 ++++
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/io/inject.c b/io/inject.c
index 6ef1fc8d2f4..4b0cd76005c 100644
--- a/io/inject.c
+++ b/io/inject.c
@@ -63,6 +63,7 @@ error_tag(char *name)
 		{ XFS_ERRTAG_ATTR_LEAF_TO_NODE,		"attr_leaf_to_node" },
 		{ XFS_ERRTAG_WB_DELAY_MS,		"wb_delay_ms" },
 		{ XFS_ERRTAG_WRITE_DELAY_MS,		"write_delay_ms" },
+		{ XFS_ERRTAG_SWAPEXT_FINISH_ONE,	"swapext_finish_one" },
 		{ XFS_ERRTAG_MAX,			NULL }
 	};
 	int	count;
diff --git a/libxfs/xfs_errortag.h b/libxfs/xfs_errortag.h
index 01a9e86b303..263d62a8d70 100644
--- a/libxfs/xfs_errortag.h
+++ b/libxfs/xfs_errortag.h
@@ -63,7 +63,8 @@
 #define XFS_ERRTAG_ATTR_LEAF_TO_NODE			41
 #define XFS_ERRTAG_WB_DELAY_MS				42
 #define XFS_ERRTAG_WRITE_DELAY_MS			43
-#define XFS_ERRTAG_MAX					44
+#define XFS_ERRTAG_SWAPEXT_FINISH_ONE			44
+#define XFS_ERRTAG_MAX					45
 
 /*
  * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -111,5 +112,6 @@
 #define XFS_RANDOM_ATTR_LEAF_TO_NODE			1
 #define XFS_RANDOM_WB_DELAY_MS				3000
 #define XFS_RANDOM_WRITE_DELAY_MS			3000
+#define XFS_RANDOM_SWAPEXT_FINISH_ONE			1
 
 #endif /* __XFS_ERRORTAG_H_ */
diff --git a/libxfs/xfs_swapext.c b/libxfs/xfs_swapext.c
index 2462657c1f4..5de586c6816 100644
--- a/libxfs/xfs_swapext.c
+++ b/libxfs/xfs_swapext.c
@@ -21,6 +21,7 @@
 #include "xfs_quota_defs.h"
 #include "xfs_health.h"
 #include "defer_item.h"
+#include "xfs_errortag.h"
 
 struct kmem_cache	*xfs_swapext_intent_cache;
 
@@ -433,6 +434,9 @@ xfs_swapext_finish_one(
 			return error;
 	}
 
+	if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_SWAPEXT_FINISH_ONE))
+		return -EIO;
+
 	/* If we still have work to do, ask for a new transaction. */
 	if (sxi_has_more_swap_work(sxi) || sxi_has_postop_work(sxi)) {
 		trace_xfs_swapext_defer(tp->t_mountp, sxi);

From patchwork Sun Dec 31 22:29:08 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507885
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id EFA82D527
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 22:29:08 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="c9W8fEJB"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id A483AC433C8;
	Sun, 31 Dec 2023 22:29:08 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704061748;
	bh=uV5TT6Kb8m2AEQdeXe5hinuY2kAnm9+cXxGq4s6RcTE=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=c9W8fEJBZHO0m/gbFIh5UiD60hlZRpPJy7YuHKZKBLvi4UcauXKvHJ7RnUflfLl7c
	 NLZJzVv1n4ScZZPudBoEMMnoxNiDwuMBEJOr+rp7Tk1MJ/jaP+9flQNGT6irfT2S44
	 epvoBDCAk5lp0Hh1Mv5l7mmlDz/3uVATBuXi0b0n6esDPutdcmvpZMVUxeFMiaDth3
	 bclICRZ7MWouVVBnAdbOmJWy0XZFiZ7lvFrUiw4vPikH3pjCvmTLHimvqOxkd1ZGf2
	 oZDcN1enxFMZqMYDQsh5mT7dasaoCnGHy6hTYq6z+L1oyJU3ztn6zxQMriPvMTcpPs
	 wpI2Vo/ftH7pQ==
Date: Sun, 31 Dec 2023 14:29:08 -0800
Subject: [PATCH 08/20] xfs: condense extended attributes after an atomic swap
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404996381.1796128.138020942106712640.stgit@frogsfrogsfrogs>
In-Reply-To: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
References: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Add a new swapext flag that enables us to perform post-swap processing
on file2 once we're done swapping the extent maps.  If we were swapping
the extended attributes, we want to be able to convert file2's attr fork
from block to inline format.

This isn't used anywhere right now, but we need to have the basic ondisk
flags in place so that a future online xattr repair feature can create
salvaged attrs in a temporary file and swap the attr forks when ready.
If one file is in extents format and the other is inline, we will have to
promote both to extents format to perform the swap.  After the swap, we
can try to condense the fixed file's attr fork back down to inline
format if possible.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_log_format.h |    9 ++++++--
 libxfs/xfs_swapext.c    |   51 ++++++++++++++++++++++++++++++++++++++++++++++-
 libxfs/xfs_swapext.h    |    9 ++++++--
 3 files changed, 64 insertions(+), 5 deletions(-)

diff --git a/libxfs/xfs_log_format.h b/libxfs/xfs_log_format.h
index 3341792cf43..d4531060b6b 100644
--- a/libxfs/xfs_log_format.h
+++ b/libxfs/xfs_log_format.h
@@ -916,18 +916,23 @@ struct xfs_swap_extent {
 /* Clear the reflink flag from inode2 after the operation. */
 #define XFS_SWAP_EXT_CLEAR_INO2_REFLINK	(1ULL << 4)
 
+/* Try to convert inode2 from block to short format at the end, if possible. */
+#define XFS_SWAP_EXT_CVT_INO2_SF	(1ULL << 5)
+
 #define XFS_SWAP_EXT_FLAGS		(XFS_SWAP_EXT_ATTR_FORK | \
 					 XFS_SWAP_EXT_SET_SIZES | \
 					 XFS_SWAP_EXT_INO1_WRITTEN | \
 					 XFS_SWAP_EXT_CLEAR_INO1_REFLINK | \
-					 XFS_SWAP_EXT_CLEAR_INO2_REFLINK)
+					 XFS_SWAP_EXT_CLEAR_INO2_REFLINK | \
+					 XFS_SWAP_EXT_CVT_INO2_SF)
 
 #define XFS_SWAP_EXT_STRINGS \
 	{ XFS_SWAP_EXT_ATTR_FORK,		"ATTRFORK" }, \
 	{ XFS_SWAP_EXT_SET_SIZES,		"SETSIZES" }, \
 	{ XFS_SWAP_EXT_INO1_WRITTEN,		"INO1_WRITTEN" }, \
 	{ XFS_SWAP_EXT_CLEAR_INO1_REFLINK,	"CLEAR_INO1_REFLINK" }, \
-	{ XFS_SWAP_EXT_CLEAR_INO2_REFLINK,	"CLEAR_INO2_REFLINK" }
+	{ XFS_SWAP_EXT_CLEAR_INO2_REFLINK,	"CLEAR_INO2_REFLINK" }, \
+	{ XFS_SWAP_EXT_CVT_INO2_SF,		"CVT_INO2_SF" }
 
 /* This is the structure used to lay out an sxi log item in the log. */
 struct xfs_sxi_log_format {
diff --git a/libxfs/xfs_swapext.c b/libxfs/xfs_swapext.c
index 5de586c6816..d643cb870c7 100644
--- a/libxfs/xfs_swapext.c
+++ b/libxfs/xfs_swapext.c
@@ -22,6 +22,10 @@
 #include "xfs_health.h"
 #include "defer_item.h"
 #include "xfs_errortag.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr.h"
 
 struct kmem_cache	*xfs_swapext_intent_cache;
 
@@ -110,7 +114,8 @@ static inline bool
 sxi_has_postop_work(const struct xfs_swapext_intent *sxi)
 {
 	return sxi->sxi_flags & (XFS_SWAP_EXT_CLEAR_INO1_REFLINK |
-				 XFS_SWAP_EXT_CLEAR_INO2_REFLINK);
+				 XFS_SWAP_EXT_CLEAR_INO2_REFLINK |
+				 XFS_SWAP_EXT_CVT_INO2_SF);
 }
 
 static inline void
@@ -358,6 +363,36 @@ xfs_swapext_exchange_mappings(
 	sxi_advance(sxi, irec1);
 }
 
+/* Convert inode2's leaf attr fork back to shortform, if possible. */
+STATIC int
+xfs_swapext_attr_to_sf(
+	struct xfs_trans		*tp,
+	struct xfs_swapext_intent	*sxi)
+{
+	struct xfs_da_args	args = {
+		.dp		= sxi->sxi_ip2,
+		.geo		= tp->t_mountp->m_attr_geo,
+		.whichfork	= XFS_ATTR_FORK,
+		.trans		= tp,
+	};
+	struct xfs_buf		*bp;
+	int			forkoff;
+	int			error;
+
+	if (!xfs_attr_is_leaf(sxi->sxi_ip2))
+		return 0;
+
+	error = xfs_attr3_leaf_read(tp, sxi->sxi_ip2, 0, &bp);
+	if (error)
+		return error;
+
+	forkoff = xfs_attr_shortform_allfit(bp, sxi->sxi_ip2);
+	if (forkoff == 0)
+		return 0;
+
+	return xfs_attr3_leaf_to_shortform(bp, &args, forkoff);
+}
+
 static inline void
 xfs_swapext_clear_reflink(
 	struct xfs_trans	*tp,
@@ -375,6 +410,16 @@ xfs_swapext_do_postop_work(
 	struct xfs_trans		*tp,
 	struct xfs_swapext_intent	*sxi)
 {
+	if (sxi->sxi_flags & XFS_SWAP_EXT_CVT_INO2_SF) {
+		int			error = 0;
+
+		if (sxi->sxi_flags & XFS_SWAP_EXT_ATTR_FORK)
+			error = xfs_swapext_attr_to_sf(tp, sxi);
+		sxi->sxi_flags &= ~XFS_SWAP_EXT_CVT_INO2_SF;
+		if (error)
+			return error;
+	}
+
 	if (sxi->sxi_flags & XFS_SWAP_EXT_CLEAR_INO1_REFLINK) {
 		xfs_swapext_clear_reflink(tp, sxi->sxi_ip1);
 		sxi->sxi_flags &= ~XFS_SWAP_EXT_CLEAR_INO1_REFLINK;
@@ -802,6 +847,8 @@ xfs_swapext_init_intent(
 
 	if (req->req_flags & XFS_SWAP_REQ_INO1_WRITTEN)
 		sxi->sxi_flags |= XFS_SWAP_EXT_INO1_WRITTEN;
+	if (req->req_flags & XFS_SWAP_REQ_CVT_INO2_SF)
+		sxi->sxi_flags |= XFS_SWAP_EXT_CVT_INO2_SF;
 
 	if (req->req_flags & XFS_SWAP_REQ_LOGGED)
 		sxi->sxi_op_flags |= XFS_SWAP_EXT_OP_LOGGED;
@@ -1021,6 +1068,8 @@ xfs_swapext(
 	ASSERT(!(req->req_flags & ~XFS_SWAP_REQ_FLAGS));
 	if (req->req_flags & XFS_SWAP_REQ_SET_SIZES)
 		ASSERT(req->whichfork == XFS_DATA_FORK);
+	if (req->req_flags & XFS_SWAP_REQ_CVT_INO2_SF)
+		ASSERT(req->whichfork == XFS_ATTR_FORK);
 
 	if (req->blockcount == 0)
 		return;
diff --git a/libxfs/xfs_swapext.h b/libxfs/xfs_swapext.h
index fa786bc9352..37842a4ee9a 100644
--- a/libxfs/xfs_swapext.h
+++ b/libxfs/xfs_swapext.h
@@ -180,16 +180,21 @@ struct xfs_swapext_req {
 /* Files need to be upgraded to have large extent counts. */
 #define XFS_SWAP_REQ_NREXT64		(1U << 3)
 
+/* Try to convert inode2's fork to local format, if possible. */
+#define XFS_SWAP_REQ_CVT_INO2_SF	(1U << 4)
+
 #define XFS_SWAP_REQ_FLAGS		(XFS_SWAP_REQ_LOGGED | \
 					 XFS_SWAP_REQ_SET_SIZES | \
 					 XFS_SWAP_REQ_INO1_WRITTEN | \
-					 XFS_SWAP_REQ_NREXT64)
+					 XFS_SWAP_REQ_NREXT64 | \
+					 XFS_SWAP_REQ_CVT_INO2_SF)
 
 #define XFS_SWAP_REQ_STRINGS \
 	{ XFS_SWAP_REQ_LOGGED,		"LOGGED" }, \
 	{ XFS_SWAP_REQ_SET_SIZES,	"SETSIZES" }, \
 	{ XFS_SWAP_REQ_INO1_WRITTEN,	"INO1_WRITTEN" }, \
-	{ XFS_SWAP_REQ_NREXT64,		"NREXT64" }
+	{ XFS_SWAP_REQ_NREXT64,		"NREXT64" }, \
+	{ XFS_SWAP_REQ_CVT_INO2_SF,	"CVT_INO2_SF" }
 
 unsigned int xfs_swapext_reflink_prep(const struct xfs_swapext_req *req);
 void xfs_swapext_reflink_finish(struct xfs_trans *tp,

From patchwork Sun Dec 31 22:29:23 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507886
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id E49ACC129
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 22:29:24 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="TVFMHwn7"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 70408C433C8;
	Sun, 31 Dec 2023 22:29:24 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704061764;
	bh=dWoZ9ckEp86/XZB2rt40VHweOj/F4QMaiz4U0stIgnQ=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=TVFMHwn7X2CbPUiiNGZ2j5FwKavTAYTakdXWryQkf2fl5FSGzpnwUC3GOnbYljMjs
	 PIhUD/jdNLX0fpW5jqnDkaD2lHq2xDlF3fp1Cp3dWXkg0Zk/DDqlrz/HmkBSd0Pyhc
	 yd74lW29iMkd1JjrLx9CKqnzPKRCUj/Ixq7G/bTOFvYKPIQ1YZjqXxdu9mhj7S8xAe
	 0weQu4gcD2cIeQiGXi5GE6h+7nKcZ2uD/sLTpb7cZDQYGpJh4oDkmEkFEFKUF8Ug3i
	 A6qqemM41W3d4XPoWW7nmSnj461Etojmfn5OUOljMGVq+E/7FhCX5q0bHsjJfiOAGb
	 3j7iQpVyF8KGw==
Date: Sun, 31 Dec 2023 14:29:23 -0800
Subject: [PATCH 09/20] xfs: condense directories after an atomic swap
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404996395.1796128.8047856963725448892.stgit@frogsfrogsfrogs>
In-Reply-To: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
References: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

The previous commit added a new swapext flag that enables us to perform
post-swap processing on file2 once we're done swapping the extent maps.
Now add this ability for directories.

This isn't used anywhere right now, but we need to have the basic ondisk
flags in place so that a future online directory repair feature can
create salvaged dirents in a temporary directory and swap the data forks
when ready.  If one file is in extents format and the other is inline,
we will have to promote both to extents format to perform the swap.
After the swap, we can try to condense the fixed directory down to
inline format if possible.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_swapext.c |   44 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/libxfs/xfs_swapext.c b/libxfs/xfs_swapext.c
index d643cb870c7..c5d404cfa56 100644
--- a/libxfs/xfs_swapext.c
+++ b/libxfs/xfs_swapext.c
@@ -26,6 +26,8 @@
 #include "xfs_da_btree.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_attr.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_dir2.h"
 
 struct kmem_cache	*xfs_swapext_intent_cache;
 
@@ -393,6 +395,42 @@ xfs_swapext_attr_to_sf(
 	return xfs_attr3_leaf_to_shortform(bp, &args, forkoff);
 }
 
+/* Convert inode2's block dir fork back to shortform, if possible. */
+STATIC int
+xfs_swapext_dir_to_sf(
+	struct xfs_trans		*tp,
+	struct xfs_swapext_intent	*sxi)
+{
+	struct xfs_da_args	args = {
+		.dp		= sxi->sxi_ip2,
+		.geo		= tp->t_mountp->m_dir_geo,
+		.whichfork	= XFS_DATA_FORK,
+		.trans		= tp,
+	};
+	struct xfs_dir2_sf_hdr	sfh;
+	struct xfs_buf		*bp;
+	bool			isblock;
+	int			size;
+	int			error;
+
+	error = xfs_dir2_isblock(&args, &isblock);
+	if (error)
+		return error;
+
+	if (!isblock)
+		return 0;	/* only block-format dirs are converted */
+
+	error = xfs_dir3_block_read(tp, sxi->sxi_ip2, &bp);
+	if (error)
+		return error;
+
+	size = xfs_dir2_block_sfsize(sxi->sxi_ip2, bp->b_addr, &sfh);
+	if (size > xfs_inode_data_fork_size(sxi->sxi_ip2))
+		return 0;	/* sf size exceeds the data fork size */
+
+	return xfs_dir2_block_to_sf(&args, bp, size, &sfh);
+}
+
 static inline void
 xfs_swapext_clear_reflink(
 	struct xfs_trans	*tp,
@@ -415,6 +453,8 @@ xfs_swapext_do_postop_work(
 
 		if (sxi->sxi_flags & XFS_SWAP_EXT_ATTR_FORK)
 			error = xfs_swapext_attr_to_sf(tp, sxi);
+		else if (S_ISDIR(VFS_I(sxi->sxi_ip2)->i_mode))
+			error = xfs_swapext_dir_to_sf(tp, sxi);
 		sxi->sxi_flags &= ~XFS_SWAP_EXT_CVT_INO2_SF;
 		if (error)
 			return error;
@@ -1069,7 +1109,9 @@ xfs_swapext(
 	if (req->req_flags & XFS_SWAP_REQ_SET_SIZES)
 		ASSERT(req->whichfork == XFS_DATA_FORK);
 	if (req->req_flags & XFS_SWAP_REQ_CVT_INO2_SF)
-		ASSERT(req->whichfork == XFS_ATTR_FORK);
+		ASSERT(req->whichfork == XFS_ATTR_FORK ||
+		       (req->whichfork == XFS_DATA_FORK &&
+			S_ISDIR(VFS_I(req->ip2)->i_mode)));
 
 	if (req->blockcount == 0)
 		return;

From patchwork Sun Dec 31 22:29:39 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507887
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9C810C127
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 22:29:40 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="kCjiAw18"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 1FDAFC433C8;
	Sun, 31 Dec 2023 22:29:40 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704061780;
	bh=K11lKVQysg2s7uYmLtAbjoHbqUzpeJ2FAuE0YomIMo4=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=kCjiAw18iBVO50I+IgwS45PzG1I+D1Dwz1JIPFwBgq9IXKiqoFzQmweEOORY/uApK
	 NFpXEF9fQeni4q0Zl1tBm8lFWuXoJ+2ZibeDeD7DOyVRNpw2Xf25UbUjQgc6wDOm5e
	 el96ASBXcM0cfNTcPs7DUw+zrKyrr9C3FVaRhsTGEITO1w1GVHyC57/YqMHhoGNDLe
	 W4Xuxk95QvejDX0p1If2FTnkKtGmuFqlwoiibKS89ufySNc7xZt57Heyy24zgNb6T4
	 it2xgNF4tXZeaNIuigi3AprA0evBcGGJL965XZ5twvMZtawxdNZGYVYnuiku2MZxSs
	 4V+x9lZtiIuJw==
Date: Sun, 31 Dec 2023 14:29:39 -0800
Subject: [PATCH 10/20] xfs: condense symbolic links after an atomic swap
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404996408.1796128.13108294692843196899.stgit@frogsfrogsfrogs>
In-Reply-To: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
References: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

An earlier commit added a new swapext flag that enables us to perform
post-swap processing on file2 once we're done swapping the extent maps.
Now add this ability for symlinks.

This isn't used anywhere right now, but we need to have the basic ondisk
flags in place so that a future online symlink repair feature can
salvage the remote target in a temporary link and swap the data forks
when ready.  If one file is in extents format and the other is inline,
we will have to promote both to extents format to perform the swap.
After the swap, we can try to condense the fixed symlink down to inline
format if possible.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_swapext.c        |   48 ++++++++++++++++++++++++++++++++++++++++++-
 libxfs/xfs_symlink_remote.c |   47 ++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfs_symlink_remote.h |    1 +
 3 files changed, 95 insertions(+), 1 deletion(-)

diff --git a/libxfs/xfs_swapext.c b/libxfs/xfs_swapext.c
index c5d404cfa56..364ae16252d 100644
--- a/libxfs/xfs_swapext.c
+++ b/libxfs/xfs_swapext.c
@@ -28,6 +28,7 @@
 #include "xfs_attr.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_dir2.h"
+#include "xfs_symlink_remote.h"
 
 struct kmem_cache	*xfs_swapext_intent_cache;
 
@@ -431,6 +432,48 @@ xfs_swapext_dir_to_sf(
 	return xfs_dir2_block_to_sf(&args, bp, size, &sfh);
 }
 
+/* Convert inode2's remote symlink target back to shortform, if possible. */
+STATIC int
+xfs_swapext_link_to_sf(
+	struct xfs_trans		*tp,
+	struct xfs_swapext_intent	*sxi)
+{
+	struct xfs_inode		*ip = sxi->sxi_ip2;
+	struct xfs_ifork		*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+	char				*buf;
+	int				error;
+
+	if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
+	    ip->i_disk_size > xfs_inode_data_fork_size(ip))
+		return 0;	/* already local, or too big to inline */
+
+	/* Read the current symlink target into a buffer (+1 spare byte). */
+	buf = kmem_alloc(ip->i_disk_size + 1, KM_NOFS);
+	if (!buf) {
+		ASSERT(0);
+		return -ENOMEM;
+	}
+
+	error = xfs_symlink_remote_read(ip, buf);
+	if (error)
+		goto free;
+
+	/* Unmap the remote blocks and invalidate their buffers. */
+	error = xfs_symlink_remote_truncate(tp, ip);
+	if (error)
+		goto free;
+
+	/* Convert fork to local format and log our changes. */
+	xfs_idestroy_fork(ifp);
+	ifp->if_bytes = 0;
+	ifp->if_format = XFS_DINODE_FMT_LOCAL;
+	xfs_init_local_fork(ip, XFS_DATA_FORK, buf, ip->i_disk_size);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
+free:
+	kmem_free(buf);		/* buf is freed on success and error alike */
+	return error;
+}
+
 static inline void
 xfs_swapext_clear_reflink(
 	struct xfs_trans	*tp,
@@ -455,6 +498,8 @@ xfs_swapext_do_postop_work(
 			error = xfs_swapext_attr_to_sf(tp, sxi);
 		else if (S_ISDIR(VFS_I(sxi->sxi_ip2)->i_mode))
 			error = xfs_swapext_dir_to_sf(tp, sxi);
+		else if (S_ISLNK(VFS_I(sxi->sxi_ip2)->i_mode))
+			error = xfs_swapext_link_to_sf(tp, sxi);
 		sxi->sxi_flags &= ~XFS_SWAP_EXT_CVT_INO2_SF;
 		if (error)
 			return error;
@@ -1111,7 +1156,8 @@ xfs_swapext(
 	if (req->req_flags & XFS_SWAP_REQ_CVT_INO2_SF)
 		ASSERT(req->whichfork == XFS_ATTR_FORK ||
 		       (req->whichfork == XFS_DATA_FORK &&
-			S_ISDIR(VFS_I(req->ip2)->i_mode)));
+			(S_ISDIR(VFS_I(req->ip2)->i_mode) ||
+			 S_ISLNK(VFS_I(req->ip2)->i_mode))));
 
 	if (req->blockcount == 0)
 		return;
diff --git a/libxfs/xfs_symlink_remote.c b/libxfs/xfs_symlink_remote.c
index 2f3aca8d02b..a4a242bc3d4 100644
--- a/libxfs/xfs_symlink_remote.c
+++ b/libxfs/xfs_symlink_remote.c
@@ -377,3 +377,50 @@ xfs_symlink_write_target(
 	ASSERT(pathlen == 0);
 	return 0;
 }
+
+/* Remove all the blocks from a symlink and invalidate buffers. */
+int
+xfs_symlink_remote_truncate(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip)
+{
+	struct xfs_bmbt_irec	mval[XFS_SYMLINK_MAPS];
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_buf		*bp;
+	int			nmaps = XFS_SYMLINK_MAPS;
+	int			done = 0;
+	int			i;
+	int			error;
+
+	/* Read up to XFS_SYMLINK_MAPS mappings and invalidate buffers. */
+	error = xfs_bmapi_read(ip, 0, XFS_MAX_FILEOFF, mval, &nmaps, 0);
+	if (error)
+		return error;
+
+	for (i = 0; i < nmaps; i++) {
+		if (!xfs_bmap_is_real_extent(&mval[i]))
+			break;	/* stop at the first non-real mapping */
+
+		error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+				XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
+				XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0,
+				&bp);
+		if (error)
+			return error;
+
+		xfs_trans_binval(tp, bp);
+	}
+
+	/* Unmap the remote blocks; this must complete in a single call. */
+	error = xfs_bunmapi(tp, ip, 0, XFS_MAX_FILEOFF, 0, nmaps, &done);
+	if (error)
+		return error;
+	if (!done) {	/* unmap did not finish: flag symlink as corrupt */
+		ASSERT(done);
+		xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK);
+		return -EFSCORRUPTED;
+	}
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	return 0;
+}
diff --git a/libxfs/xfs_symlink_remote.h b/libxfs/xfs_symlink_remote.h
index a63bd38ae4f..ac3dac8f617 100644
--- a/libxfs/xfs_symlink_remote.h
+++ b/libxfs/xfs_symlink_remote.h
@@ -22,5 +22,6 @@ int xfs_symlink_remote_read(struct xfs_inode *ip, char *link);
 int xfs_symlink_write_target(struct xfs_trans *tp, struct xfs_inode *ip,
 		const char *target_path, int pathlen, xfs_fsblock_t fs_blocks,
 		uint resblks);
+int xfs_symlink_remote_truncate(struct xfs_trans *tp, struct xfs_inode *ip);
 
 #endif /* __XFS_SYMLINK_REMOTE_H */

From patchwork Sun Dec 31 22:29:55 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507888
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id E8C35C129
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 22:29:55 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="SE/e8dcN"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id BEE86C433C8;
	Sun, 31 Dec 2023 22:29:55 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704061795;
	bh=JxqMQGkLCpUi4OwwOZdzMQJ5kuFzpSMZgu1FLbHR6kM=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=SE/e8dcNq8ybJhfxRRToFk2b4m41+bZ8CkKUnBRLNBmfcbBCaVFAtX3DPpJ1rHdAC
	 bhXwmFZfzlqJ7rF1VZEKOelGw8ipJslvRiAK2ts6BGUQRP8qmuIg6huyxro3izsUK/
	 OWw5jKssFbdHJuyi3lesTq0PB0FrNSakozrE7blK7Icet/48O/Q9wgsNFcFCsYEOiW
	 brc8/+7+OJn27LHNaawC7KgjJQTXcgZK3GXlxIAg11iU3eF1pPoNn1qEBy3Q+XzKOE
	 SkL6tXmNM4p0X5SraBU0W+mlj7ZEgc82iUNwRWiCyg/mGsAeLGpvazifiPKQaDNZn8
	 kEWJkPetGa39Q==
Date: Sun, 31 Dec 2023 14:29:55 -0800
Subject: [PATCH 11/20] xfs: make atomic extent swapping support realtime files
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404996422.1796128.15098682086982326791.stgit@frogsfrogsfrogs>
In-Reply-To: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
References: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Now that bmap items support the realtime device, we can add the
necessary pieces to the atomic extent swapping code to support such
things.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/xfs_inode.h  |    5 ++
 libxfs/xfs_swapext.c |  165 +++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 161 insertions(+), 9 deletions(-)

diff --git a/include/xfs_inode.h b/include/xfs_inode.h
index bcac3a09c6b..302df4c6f7e 100644
--- a/include/xfs_inode.h
+++ b/include/xfs_inode.h
@@ -325,6 +325,11 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
 	return ip->i_diflags2 & XFS_DIFLAG2_NREXT64;
 }
 
+static inline bool xfs_inode_has_bigallocunit(struct xfs_inode *ip)
+{
+	return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1;
+}
+
 /* Always set the child's GID to this value, even if the parent is setgid. */
 #define CRED_FORCE_GID	(1U << 0)
 struct cred {
diff --git a/libxfs/xfs_swapext.c b/libxfs/xfs_swapext.c
index 364ae16252d..92d2f8fa133 100644
--- a/libxfs/xfs_swapext.c
+++ b/libxfs/xfs_swapext.c
@@ -29,6 +29,7 @@
 #include "xfs_dir2_priv.h"
 #include "xfs_dir2.h"
 #include "xfs_symlink_remote.h"
+#include "xfs_rtbitmap.h"
 
 struct kmem_cache	*xfs_swapext_intent_cache;
 
@@ -131,6 +132,102 @@ sxi_advance(
 	sxi->sxi_blockcount -= irec->br_blockcount;
 }
 
+#ifdef DEBUG
+/*
+ * For a BUI-only (unlogged) rt data fork swap, check that every mapping in
+ * both files is rtx-aligned.  Returns 0, -EINVAL, or a bmapi read error.
+ */
+static inline int
+xfs_swapext_check_rt_extents(
+	struct xfs_mount		*mp,
+	const struct xfs_swapext_req	*req)
+{
+	struct xfs_bmbt_irec		irec1, irec2;
+	xfs_fileoff_t			startoff1 = req->startoff1;
+	xfs_fileoff_t			startoff2 = req->startoff2;
+	xfs_filblks_t			blockcount = req->blockcount;
+	uint32_t			mod;
+	int				nimaps;
+	int				error;
+
+	/* xattrs don't live on the rt device */
+	if (req->whichfork == XFS_ATTR_FORK)
+		return 0;
+
+	/*
+	 * Caller got permission to use SXI log items, so log recovery will
+	 * finish the swap and not leave us with partially swapped rt extents
+	 * exposed to userspace.
+	 */
+	if (req->req_flags & XFS_SWAP_REQ_LOGGED)
+		return 0;
+
+	/*
+	 * Allocation units must be fully mapped to a file range.  For files
+	 * with a single-fsblock allocation unit, this is trivial.
+	 */
+	if (!xfs_inode_has_bigallocunit(req->ip2))
+		return 0;
+
+	/*
+	 * For multi-fsblock allocation units, we must check the alignment of
+	 * every single mapping.
+	 */
+	while (blockcount > 0) {
+		/* Read extent from the first file */
+		nimaps = 1;
+		error = xfs_bmapi_read(req->ip1, startoff1, blockcount,
+				&irec1, &nimaps, 0);
+		if (error)
+			return error;
+		ASSERT(nimaps == 1);
+
+		/* Read extent from the second file, no longer than the first */
+		nimaps = 1;
+		error = xfs_bmapi_read(req->ip2, startoff2,
+				irec1.br_blockcount, &irec2, &nimaps,
+				0);
+		if (error)
+			return error;
+		ASSERT(nimaps == 1);
+
+		/*
+		 * We can only swap as many blocks as the smaller of the two
+		 * extent maps.
+		 */
+		irec1.br_blockcount = min(irec1.br_blockcount,
+					  irec2.br_blockcount);
+
+		/* Both mappings must be aligned to the realtime extent size. */
+		mod = xfs_rtb_to_rtxoff(mp, irec1.br_startoff);
+		if (mod) {
+			ASSERT(mod == 0);
+			return -EINVAL;
+		}
+
+		mod = xfs_rtb_to_rtxoff(mp, irec2.br_startoff);
+		if (mod) {
+			ASSERT(mod == 0);
+			return -EINVAL;
+		}
+
+		mod = xfs_rtb_to_rtxoff(mp, irec1.br_blockcount);
+		if (mod) {
+			ASSERT(mod == 0);
+			return -EINVAL;
+		}
+
+		startoff1 += irec1.br_blockcount;
+		startoff2 += irec1.br_blockcount;
+		blockcount -= irec1.br_blockcount;
+	}
+
+	return 0;
+}
+#else
+# define xfs_swapext_check_rt_extents(mp, req)		(0)
+#endif
+
 /* Check all extents to make sure we can actually swap them. */
 int
 xfs_swapext_check_extents(
@@ -150,12 +247,7 @@ xfs_swapext_check_extents(
 	    ifp2->if_format == XFS_DINODE_FMT_LOCAL)
 		return -EINVAL;
 
-	/* We don't support realtime data forks yet. */
-	if (!XFS_IS_REALTIME_INODE(req->ip1))
-		return 0;
-	if (req->whichfork == XFS_ATTR_FORK)
-		return 0;
-	return -EINVAL;
+	return xfs_swapext_check_rt_extents(mp, req);
 }
 
 #ifdef CONFIG_XFS_QUOTA
@@ -196,6 +288,8 @@ xfs_swapext_can_skip_mapping(
 	struct xfs_swapext_intent	*sxi,
 	struct xfs_bmbt_irec		*irec)
 {
+	struct xfs_mount		*mp = sxi->sxi_ip1->i_mount;
+
 	/* Do not skip this mapping if the caller did not tell us to. */
 	if (!(sxi->sxi_flags & XFS_SWAP_EXT_INO1_WRITTEN))
 		return false;
@@ -208,10 +302,63 @@ xfs_swapext_can_skip_mapping(
 	 * The mapping is unwritten or a hole.  It cannot be a delalloc
 	 * reservation because we already excluded those.  It cannot be an
 	 * unwritten extent with dirty page cache because we flushed the page
-	 * cache.  We don't support realtime files yet, so we needn't (yet)
-	 * deal with them.
+	 * cache.  For files where the allocation unit is 1FSB (files on the
+	 * data dev, rt files if the extent size is 1FSB), we can safely
+	 * skip this mapping.
 	 */
-	return true;
+	if (!xfs_inode_has_bigallocunit(sxi->sxi_ip1))
+		return true;
+
+	/*
+	 * For a realtime file with a multi-fsb allocation unit, the decision
+	 * is trickier because we can only swap full allocation units.
+	 * Unwritten mappings can appear in the middle of an rtx if the rtx is
+	 * partially written, but they can also appear for preallocations.
+	 *
+	 * If the mapping is a hole, skip it entirely.  Holes should align with
+	 * rtx boundaries.
+	 */
+	if (!xfs_bmap_is_real_extent(irec))
+		return true;
+
+	/*
+	 * All mappings below this point are unwritten.
+	 *
+	 * - If the beginning is not aligned to an rtx, trim the end of the
+	 *   mapping so that it does not cross an rtx boundary, and swap it.
+	 *
+	 * - If both ends are aligned to an rtx, skip the entire mapping.
+	 */
+	if (!isaligned_64(irec->br_startoff, mp->m_sb.sb_rextsize)) {
+		xfs_fileoff_t	new_end;
+
+		new_end = roundup_64(irec->br_startoff, mp->m_sb.sb_rextsize);
+		irec->br_blockcount = min(irec->br_blockcount,
+					  new_end - irec->br_startoff);
+		return false;
+	}
+	if (isaligned_64(irec->br_blockcount, mp->m_sb.sb_rextsize))
+		return true;
+
+	/*
+	 * All mappings below this point are unwritten, start on an rtx
+	 * boundary, and do not end on an rtx boundary.
+	 *
+	 * - If the mapping is longer than one rtx, trim the end of the mapping
+	 *   down to an rtx boundary and skip it.
+	 *
+	 * - The mapping is shorter than one rtx.  Swap it.
+	 */
+	if (irec->br_blockcount > mp->m_sb.sb_rextsize) {
+		xfs_fileoff_t	new_end;
+
+		new_end = rounddown_64(irec->br_startoff + irec->br_blockcount,
+				mp->m_sb.sb_rextsize);
+		irec->br_blockcount = new_end - irec->br_startoff;
+		return true;
+	}
+
+	return false;
 }
 
 /*

From patchwork Sun Dec 31 22:30:10 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507889
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 054F1C2CC
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 22:30:11 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="NC8CWzku"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 71A26C433C7;
	Sun, 31 Dec 2023 22:30:11 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704061811;
	bh=Tmw+RrEhWqmyAfmYca5Gbb6Byuuz8dJ/cVkGVJfGYRQ=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=NC8CWzkuJBCbWTmyVIxdfRgphRBwWlCnt5QBTnA45t0S+raPvYoMdcUxzJWHzsa12
	 PMXoU6UqbfpcJ6+IiaBYKp8KYMb7n8jDo7ZB1qm3BxJEXC3qx8VnWIgg7EUrhmw+vg
	 xk2yv7Wxb1ox+EwiAdmN+yKzn4NTvBxIduyk6+4dI0UBJAkz2tcmx5l3/cKL5GWTaL
	 3IfP5vOd5VZz+Xicx9UMeKXzhcrnPnobPkKS1bvI2eeLfKr34EiY3SBlGoJ7WCcMmV
	 0wsgofvHcNdgRYqNlae1r+4yRQXJIFlrQMZDuTFqCrJf5fCHk8TzZonwzE30CCsvzt
	 xoZBs5o13zODA==
Date: Sun, 31 Dec 2023 14:30:10 -0800
Subject: [PATCH 12/20] xfs: enable atomic swapext feature
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404996435.1796128.504794578530787665.stgit@frogsfrogsfrogs>
In-Reply-To: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
References: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Add the atomic swapext feature to the set of features that we will
permit.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/xfs_format.h |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libxfs/xfs_format.h b/libxfs/xfs_format.h
index 8b34754a579..7861539ab8b 100644
--- a/libxfs/xfs_format.h
+++ b/libxfs/xfs_format.h
@@ -398,7 +398,8 @@ xfs_sb_has_incompat_feature(
  */
 #define XFS_SB_FEAT_INCOMPAT_LOG_SWAPEXT  (1U << 31)
 #define XFS_SB_FEAT_INCOMPAT_LOG_ALL \
-	(XFS_SB_FEAT_INCOMPAT_LOG_XATTRS)
+		(XFS_SB_FEAT_INCOMPAT_LOG_XATTRS | \
+		 XFS_SB_FEAT_INCOMPAT_LOG_SWAPEXT)
 #define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN	~XFS_SB_FEAT_INCOMPAT_LOG_ALL
 static inline bool
 xfs_sb_has_incompat_log_feature(

From patchwork Sun Dec 31 22:30:26 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507890
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8F15EC8CA
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 22:30:27 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="HMTvWYtG"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 178F7C433C8;
	Sun, 31 Dec 2023 22:30:27 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704061827;
	bh=+9lcWJz+lQJaSyRDLxcZVTu1yMluLChCQdu0hwIlZ1E=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=HMTvWYtGQYXyVerLcUXYr9xke+pdMZtpePHvI/XLup/FvvJORxCrqPVTZY8jtmb9S
	 uxGdIB3aSOf+kab/hy46V7Nq3O8Pg6IXOQ4fqpgzuqAgti02RyvYhzymuYEA3Z4w1E
	 VRfYnwvdlPqZdgEHspwpom3B8JktXO7zahYJQUkEYbV07UUzccTXCrjqhgUdtfeHsz
	 clxmtLG+qF4Nbp0RF6itUbbZx0nekUrZSY1nR9L352eCkhnvdX0WYKZGcyhgYsEePX
	 RYNAJ0ezYK1kPCKw97ZJiUPD2S7Vzf3RCKSTu8QMMF5JNbFyqqh/pjQbuFKYkSqG/j
	 Xpw8++XAjgBVw==
Date: Sun, 31 Dec 2023 14:30:26 -0800
Subject: [PATCH 13/20] libhandle: add support for bulkstat v5
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404996449.1796128.13856662543334507066.stgit@frogsfrogsfrogs>
In-Reply-To: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
References: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Add support to libhandle for generating file handles with bulkstat v5
structures.  xfs_fsr will need this to be able to interface with the new
vfs range swap ioctl, and other client programs will probably want this
over time.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 include/jdm.h   |   24 +++++++++++
 libhandle/jdm.c |  117 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 141 insertions(+)

diff --git a/include/jdm.h b/include/jdm.h
index c57fcae7fca..445737a6b5f 100644
--- a/include/jdm.h
+++ b/include/jdm.h
@@ -11,6 +11,7 @@ typedef void	jdm_fshandle_t;		/* filesystem handle */
 typedef void	jdm_filehandle_t;	/* filehandle */
 
 struct xfs_bstat;
+struct xfs_bulkstat;
 struct attrlist_cursor;
 struct parent;
 
@@ -23,6 +24,9 @@ jdm_new_filehandle( jdm_filehandle_t **handlep,	/* new filehandle */
 		    jdm_fshandle_t *fshandlep,	/* filesystem filehandle */
 		    struct xfs_bstat *sp);	/* bulkstat info */
 
+extern void jdm_new_filehandle_v5(jdm_filehandle_t **handlep, size_t *hlen,
+		jdm_fshandle_t *fshandlep, struct xfs_bulkstat *sp);
+
 extern void
 jdm_delete_filehandle( jdm_filehandle_t *handlep,/* filehandle to delete */
 		       size_t hlen);		/* filehandle size */
@@ -32,35 +36,55 @@ jdm_open( jdm_fshandle_t *fshandlep,
 	  struct xfs_bstat *sp,
 	  intgen_t oflags);
 
+extern intgen_t jdm_open_v5(jdm_fshandle_t *fshandlep, struct xfs_bulkstat *sp,
+		intgen_t oflags);
+
 extern intgen_t
 jdm_readlink( jdm_fshandle_t *fshandlep,
 	      struct xfs_bstat *sp,
 	      char *bufp,
 	      size_t bufsz);
 
+extern intgen_t jdm_readlink_v5(jdm_fshandle_t *fshandlep,
+		struct xfs_bulkstat *sp, char *bufp, size_t bufsz);
+
 extern intgen_t
 jdm_attr_multi(	jdm_fshandle_t *fshp,
 		struct xfs_bstat *statp,
 		char *bufp, int rtrvcnt, int flags);
 
+extern intgen_t jdm_attr_multi_v5(jdm_fshandle_t *fshp,
+		struct xfs_bulkstat *statp, char *bufp, int rtrvcnt,
+		int flags);
+
 extern intgen_t
 jdm_attr_list(	jdm_fshandle_t *fshp,
 		struct xfs_bstat *statp,
 		char *bufp, size_t bufsz, int flags,
 		struct attrlist_cursor *cursor);
 
+extern intgen_t jdm_attr_list_v5(jdm_fshandle_t *fshp,
+		struct xfs_bulkstat *statp, char *bufp, size_t bufsz, int
+		flags, struct attrlist_cursor *cursor);
+
 extern int
 jdm_parents( jdm_fshandle_t *fshp,
 		struct xfs_bstat *statp,
 		struct parent *bufp, size_t bufsz,
 		unsigned int *count);
 
+extern int jdm_parents_v5(jdm_fshandle_t *fshp, struct xfs_bulkstat *statp,
+		struct parent *bufp, size_t bufsz, unsigned int *count);
+
 extern int
 jdm_parentpaths( jdm_fshandle_t *fshp,
 		struct xfs_bstat *statp,
 		struct parent *bufp, size_t bufsz,
 		unsigned int *count);
 
+extern int jdm_parentpaths_v5(jdm_fshandle_t *fshp, struct xfs_bulkstat *statp,
+		struct parent *bufp, size_t bufsz, unsigned int *count);
+
 /* macro for determining the size of a structure member */
 #define sizeofmember( t, m )	sizeof( ( ( t * )0 )->m )
 
diff --git a/libhandle/jdm.c b/libhandle/jdm.c
index 07b0c60985e..e21aff2b2c1 100644
--- a/libhandle/jdm.c
+++ b/libhandle/jdm.c
@@ -41,6 +41,19 @@ jdm_fill_filehandle( filehandle_t *handlep,
 	handlep->fh_ino = statp->bs_ino;
 }
 
+static void
+jdm_fill_filehandle_v5(
+	struct filehandle	*handlep,
+	struct fshandle		*fshandlep,
+	struct xfs_bulkstat	*statp)
+{
+	handlep->fh_fshandle = *fshandlep;
+	handlep->fh_sz_following = FILEHANDLE_SZ_FOLLOWING;
+	memset(handlep->fh_pad, 0, FILEHANDLE_SZ_PAD);
+	handlep->fh_gen = statp->bs_gen;
+	handlep->fh_ino = statp->bs_ino;
+}
+
 jdm_fshandle_t *
 jdm_getfshandle( char *mntpnt )
 {
@@ -90,6 +103,22 @@ jdm_new_filehandle( jdm_filehandle_t **handlep,
 		jdm_fill_filehandle(*handlep, (fshandle_t *) fshandlep, statp);
 }
 
+void
+jdm_new_filehandle_v5(
+	jdm_filehandle_t	**handlep,
+	size_t			*hlen,
+	jdm_fshandle_t		*fshandlep,
+	struct xfs_bulkstat	*statp)
+{
+	/* allocate and fill filehandle; *handlep is NULL if malloc fails */
+	*hlen = sizeof(filehandle_t);
+	*handlep = (filehandle_t *) malloc(*hlen);
+	if (!*handlep)
+		return;
+
+	jdm_fill_filehandle_v5(*handlep, (struct fshandle *)fshandlep, statp);
+}
+
 /* ARGSUSED */
 void
 jdm_delete_filehandle( jdm_filehandle_t *handlep, size_t hlen )
@@ -111,6 +140,19 @@ jdm_open( jdm_fshandle_t *fshp, struct xfs_bstat *statp, intgen_t oflags )
 	return fd;
 }
 
+intgen_t
+jdm_open_v5(
+	jdm_fshandle_t		*fshp,
+	struct xfs_bulkstat	*statp,
+	intgen_t		oflags)
+{
+	struct filehandle	fh;
+
+	/* Build a file handle for this inode from fs handle + bulkstat. */
+	jdm_fill_filehandle_v5(&fh, (struct fshandle *)fshp, statp);
+	return open_by_fshandle(&fh, sizeof(fh), oflags);
+}
+
 intgen_t
 jdm_readlink( jdm_fshandle_t *fshp,
 	      struct xfs_bstat *statp,
@@ -128,6 +170,20 @@ jdm_readlink( jdm_fshandle_t *fshp,
 	return rval;
 }
 
+intgen_t
+jdm_readlink_v5(
+	jdm_fshandle_t		*fshp,
+	struct xfs_bulkstat	*statp,
+	char			*bufp,
+	size_t			bufsz)
+{
+	struct filehandle	fh;
+
+	/* Read the link target through a handle built for this inode. */
+	jdm_fill_filehandle_v5(&fh, (struct fshandle *)fshp, statp);
+	return readlink_by_handle(&fh, sizeof(fh), bufp, bufsz);
+}
+
 int
 jdm_attr_multi(	jdm_fshandle_t *fshp,
 		struct xfs_bstat *statp,
@@ -145,6 +201,22 @@ jdm_attr_multi(	jdm_fshandle_t *fshp,
 	return rval;
 }
 
+int
+jdm_attr_multi_v5(
+	jdm_fshandle_t		*fshp,
+	struct xfs_bulkstat	*statp,
+	char			*bufp,
+	int			rtrvcnt,
+	int			flags)
+{
+	struct filehandle	fh;
+
+	/* Run the multi-attr operations against the handle for this inode. */
+	jdm_fill_filehandle_v5(&fh, (struct fshandle *)fshp, statp);
+
+	return attr_multi_by_handle(&fh, sizeof(fh), bufp, rtrvcnt, flags);
+}
+
 int
 jdm_attr_list(	jdm_fshandle_t *fshp,
 		struct xfs_bstat *statp,
@@ -166,6 +238,27 @@ jdm_attr_list(	jdm_fshandle_t *fshp,
 	return rval;
 }
 
+int
+jdm_attr_list_v5(
+	jdm_fshandle_t		*fshp,
+	struct xfs_bulkstat	*statp,
+	char			*bufp,
+	size_t			bufsz,
+	int			flags,
+	struct attrlist_cursor	*cursor)
+{
+	struct filehandle	fh;
+	size_t			len = bufsz;
+
+	/* Clamp oversized buffers; the kernel would reject them with EINVAL. */
+	if (len > XFS_XATTR_LIST_MAX)
+		len = XFS_XATTR_LIST_MAX;
+
+	/* List the attrs of the inode named by the fs handle + bulkstat. */
+	jdm_fill_filehandle_v5(&fh, (struct fshandle *)fshp, statp);
+	return attr_list_by_handle(&fh, sizeof(fh), bufp, len, flags, cursor);
+}
+
 int
 jdm_parents( jdm_fshandle_t *fshp,
 		struct xfs_bstat *statp,
@@ -176,6 +269,18 @@ jdm_parents( jdm_fshandle_t *fshp,
 	return -1;
 }
 
+int
+jdm_parents_v5(
+	jdm_fshandle_t		*fshp,
+	struct xfs_bulkstat	*statp,
+	struct parent		*bufp,
+	size_t			bufsz,
+	unsigned int		*count)
+{
+	errno = EOPNOTSUPP;	/* stub: always fails, arguments are unused */
+	return -1;
+}
+
 int
 jdm_parentpaths( jdm_fshandle_t *fshp,
 		struct xfs_bstat *statp,
@@ -185,3 +290,15 @@ jdm_parentpaths( jdm_fshandle_t *fshp,
 	errno = EOPNOTSUPP;
 	return -1;
 }
+
+int
+jdm_parentpaths_v5(
+	jdm_fshandle_t		*fshp,
+	struct xfs_bulkstat	*statp,
+	struct parent		*bufp,
+	size_t			bufsz,
+	unsigned int		*count)
+{
+	errno = EOPNOTSUPP;	/* stub: always fails, arguments are unused */
+	return -1;
+}

From patchwork Sun Dec 31 22:30:42 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507891
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 37FACC8C8
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 22:30:42 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="A4ehAoW6"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id B8B20C433C8;
	Sun, 31 Dec 2023 22:30:42 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704061842;
	bh=KbI2zZj+pu4iq7XYYj1vMzH7fiOD8HnB1h1tXFvANPg=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=A4ehAoW6VQpdBVCI8Vwf84PbaBicvKJ73A/piHEV4L1qg3o7xDWP3hC9DgFAnQ7bH
	 glRCFdikr4lnu37Hx5xKeWQwOprcj42akdsy3LdFrG8imkOhEiYhGvLM96V58SD17x
	 AJFO37TjleGWXyy+MS0ZBKIuTs9AQbehZZsId8FbjqUXGpW9GVBJPgSV/+KDIPvZ1C
	 rXlqPR0O7MQkvSOxKO8dkp5Vr0gdRNl1ohj15p2XYYS+mj/yfcV4BqMkkiapLCcM66
	 vr41TxxfeRaShNDUOBnKaPzo2YyAcmnbsggf0pzkA5J+b+lEzboe6brK6rjNSkwKcO
	 QTDOLTarEGJ2g==
Date: Sun, 31 Dec 2023 14:30:42 -0800
Subject: [PATCH 14/20] libfrog: convert xfs_io swapext command to use new
 libfrog wrapper
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404996462.1796128.4278002541789320850.stgit@frogsfrogsfrogs>
In-Reply-To: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
References: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Create an abstraction layer for the two swapext ioctls and port xfs_io
to use it.  Now we're insulated from the differences between the XFS v0
ioctl and the new vfs ioctl.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 io/swapext.c            |   54 ++++++++-------
 libfrog/Makefile        |    2 +
 libfrog/file_exchange.c |  169 +++++++++++++++++++++++++++++++++++++++++++++++
 libfrog/file_exchange.h |   14 ++++
 libfrog/fsgeom.h        |    6 ++
 5 files changed, 218 insertions(+), 27 deletions(-)
 create mode 100644 libfrog/file_exchange.c
 create mode 100644 libfrog/file_exchange.h

diff --git a/io/swapext.c b/io/swapext.c
index a4153bb7d42..15ed3559398 100644
--- a/io/swapext.c
+++ b/io/swapext.c
@@ -10,7 +10,7 @@
 #include "io.h"
 #include "libfrog/logging.h"
 #include "libfrog/fsgeom.h"
-#include "libfrog/bulkstat.h"
+#include "libfrog/file_exchange.h"
 
 static cmdinfo_t swapext_cmd;
 
@@ -28,47 +28,47 @@ swapext_f(
 	int			argc,
 	char			**argv)
 {
-	struct xfs_fd		fxfd = XFS_FD_INIT(file->fd);
-	struct xfs_bulkstat	bulkstat;
-	int			fd;
-	int			error;
-	struct xfs_swapext	sx;
+	struct xfs_fd		xfd = XFS_FD_INIT(file->fd);
+	struct xfs_exch_range	fxr;
 	struct stat		stat;
+	uint64_t		flags = XFS_EXCH_RANGE_FILE2_FRESH |
+					XFS_EXCH_RANGE_FULL_FILES;
+	int			fd;
+	int			ret;
 
 	/* open the donor file */
 	fd = openfile(argv[1], NULL, 0, 0, NULL);
 	if (fd < 0)
 		return 0;
 
-	/*
-	 * stat the target file to get the inode number and use the latter to
-	 * get the bulkstat info for the swapext cmd.
-	 */
-	error = fstat(file->fd, &stat);
-	if (error) {
+	ret = -xfd_prepare_geometry(&xfd);
+	if (ret) {
+		xfrog_perror(ret, "xfd_prepare_geometry");
+		exitcode = 1;
+		goto out;
+	}
+
+	ret = fstat(file->fd, &stat);
+	if (ret) {
 		perror("fstat");
+		exitcode = 1;
 		goto out;
 	}
 
-	error = -xfrog_bulkstat_single(&fxfd, stat.st_ino, 0, &bulkstat);
-	if (error) {
-		xfrog_perror(error, "bulkstat");
+	ret = -xfrog_file_exchange_prep(&xfd, flags, 0, fd, 0, stat.st_size,
+			&fxr);	/* negate: xfrog_perror wants +errno */
+	if (ret) {
+		xfrog_perror(ret, "xfrog_file_exchange_prep");
+		exitcode = 1;
 		goto out;
 	}
-	error = -xfrog_bulkstat_v5_to_v1(&fxfd, &sx.sx_stat, &bulkstat);
-	if (error) {
-		xfrog_perror(error, "bulkstat conversion");
+
+	ret = -xfrog_file_exchange(&xfd, &fxr);	/* likewise returns -errno */
+	if (ret) {
+		xfrog_perror(ret, "swapext");
+		exitcode = 1;
 		goto out;
 	}
-	sx.sx_version = XFS_SX_VERSION;
-	sx.sx_fdtarget = file->fd;
-	sx.sx_fdtmp = fd;
-	sx.sx_offset = 0;
-	sx.sx_length = stat.st_size;
-	error = ioctl(file->fd, XFS_IOC_SWAPEXT, &sx);
-	if (error)
-		perror("swapext");
-
 out:
 	close(fd);
 	return 0;
diff --git a/libfrog/Makefile b/libfrog/Makefile
index dcfd1fb8a93..f8bb39f2712 100644
--- a/libfrog/Makefile
+++ b/libfrog/Makefile
@@ -18,6 +18,7 @@ bitmap.c \
 bulkstat.c \
 convert.c \
 crc32.c \
+file_exchange.c \
 fsgeom.c \
 list_sort.c \
 linux.c \
@@ -42,6 +43,7 @@ crc32defs.h \
 crc32table.h \
 dahashselftest.h \
 div64.h \
+file_exchange.h \
 fsgeom.h \
 logging.h \
 paths.h \
diff --git a/libfrog/file_exchange.c b/libfrog/file_exchange.c
new file mode 100644
index 00000000000..4a66aa752fc
--- /dev/null
+++ b/libfrog/file_exchange.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <string.h>
+#include "xfs.h"
+#include "fsgeom.h"
+#include "bulkstat.h"
+#include "file_exchange.h"
+
+/* Record file2's inode number and [cm]time in a swapext request. */
+static int
+xfrog_file_exchange_prep_freshness(
+	struct xfs_fd		*dest,
+	struct xfs_exch_range	*req)
+{
+	struct xfs_bulkstat	bs;
+	struct stat		sb;
+
+	if (fstat(dest->fd, &sb))
+		return -errno;
+
+	/* Start from stat(); bulkstat may override the timestamps below. */
+	req->file2_ino = sb.st_ino;
+	req->file2_mtime = sb.st_mtime;
+	req->file2_ctime = sb.st_ctime;
+	req->file2_mtime_nsec = sb.st_mtim.tv_nsec;
+	req->file2_ctime_nsec = sb.st_ctim.tv_nsec;
+
+	/*
+	 * Bulkstat v5 is preferred because it reports 64-bit timestamps even
+	 * on 32-bit systems.  Keep the stat() values if the call fails.
+	 */
+	if (xfrog_bulkstat_single(dest, sb.st_ino, 0, &bs))
+		return 0;
+
+	/*
+	 * Emulated (pre-v5) bulkstat on a bigtime filesystem: take our chances
+	 * with the C library's timestamps instead.
+	 */
+	if ((dest->fsgeom.flags & XFS_FSOP_GEOM_FLAGS_BIGTIME) &&
+	    bs.bs_version < XFS_BULKSTAT_VERSION_V5)
+		return 0;
+
+	req->file2_mtime = bs.bs_mtime;
+	req->file2_ctime = bs.bs_ctime;
+	req->file2_mtime_nsec = bs.bs_mtime_nsec;
+	req->file2_ctime_nsec = bs.bs_ctime_nsec;
+	return 0;
+}
+
+/* Fill out @req for an exchange of @length bytes; returns 0 or -errno. */
+int
+xfrog_file_exchange_prep(
+	struct xfs_fd		*dest,
+	uint64_t		flags,
+	int64_t			file2_offset,
+	int			file1_fd,
+	int64_t			file1_offset,
+	int64_t			length,
+	struct xfs_exch_range	*req)
+{
+	memset(req, 0, sizeof(*req));
+	req->flags = flags;
+	req->length = length;
+	req->file1_fd = file1_fd;
+	req->file1_offset = file1_offset;
+	req->file2_offset = file2_offset;
+
+	if (!(flags & XFS_EXCH_RANGE_FILE2_FRESH))
+		return 0;
+	/* Caller wants a freshness check; record file2's inode and [cm]time. */
+	return xfrog_file_exchange_prep_freshness(dest, req);
+}
+
+/* Call XFS_IOC_EXCHANGE_RANGE on xfd; returns 0 or a negative errno. */
+static int
+xfrog_file_exchange_range(
+	struct xfs_fd		*xfd,
+	struct xfs_exch_range	*req)
+{
+	if (!ioctl(xfd->fd, XFS_IOC_EXCHANGE_RANGE, req))
+		return 0;
+
+	/*
+	 * Report EDOM as EFAULT, which is what the old swapext ioctl returned
+	 * for a bad length.
+	 */
+	if (errno == EDOM)
+		return -EFAULT;
+	return -errno;
+}
+
+/*
+ * The old swapext ioctl did not provide atomic swap; it required that the
+ * supplied offset and length matched both files' lengths; and it also required
+ * that the sx_stat information match the dest file.  It doesn't support any
+ * other flags.
+ */
+#define XFS_EXCH_RANGE_SWAPEXT	(XFS_EXCH_RANGE_NONATOMIC | \
+				 XFS_EXCH_RANGE_FULL_FILES | \
+				 XFS_EXCH_RANGE_FILE2_FRESH)
+
+/* Swap two files' extents with the old XFS_IOC_SWAPEXT; 0 or -errno. */
+static int
+xfrog_file_exchange_swapext(
+	struct xfs_fd		*xfd,
+	struct xfs_exch_range	*req)
+{
+	struct xfs_swapext	sx = {
+		.sx_version	= XFS_SX_VERSION,
+		.sx_fdtarget	= xfd->fd,
+		.sx_fdtmp	= req->file1_fd,
+		.sx_offset	= req->file2_offset,	/* same as file1's */
+		.sx_length	= req->length,
+	};
+
+	if (req->file1_offset != req->file2_offset)
+		return -EINVAL;
+	if (req->flags != XFS_EXCH_RANGE_SWAPEXT)
+		return -EOPNOTSUPP;
+
+	/* sx_stat carries the freshness data: inode number and [cm]time. */
+	sx.sx_stat.bs_ino = req->file2_ino;
+	sx.sx_stat.bs_ctime.tv_sec = req->file2_ctime;
+	sx.sx_stat.bs_ctime.tv_nsec = req->file2_ctime_nsec;
+	sx.sx_stat.bs_mtime.tv_sec = req->file2_mtime;
+	sx.sx_stat.bs_mtime.tv_nsec = req->file2_mtime_nsec;
+
+	if (ioctl(xfd->fd, XFS_IOC_SWAPEXT, &sx))
+		return -errno;
+	return 0;
+}
+
+/*
+ * Swap extents between an XFS file and a donor fd.  Tries the exchange range
+ * ioctl first and falls back to the old swapext ioctl; returns 0 or -errno.
+ */
+int
+xfrog_file_exchange(
+	struct xfs_fd		*xfd,
+	struct xfs_exch_range	*req)
+{
+	int			ret;
+
+	if (!(xfd->flags & XFROG_FLAG_FORCE_SWAPEXT)) {
+		ret = xfrog_file_exchange_range(xfd, req);
+		/* Any result other than "ioctl not found" is final. */
+		if (ret != -ENOTTY && ret != -EOPNOTSUPP)
+			return ret;
+
+		/* The caller forbade the swapext fallback. */
+		if (xfd->flags & XFROG_FLAG_FORCE_EXCH_RANGE)
+			return ret;
+
+		/*
+		 * The new exchange range ioctl wasn't found; remember that so
+		 * later calls on this xfd go straight to the old ioctl.
+		 */
+		xfd->flags |= XFROG_FLAG_FORCE_SWAPEXT;
+	}
+
+	return xfrog_file_exchange_swapext(xfd, req);
+}
diff --git a/libfrog/file_exchange.h b/libfrog/file_exchange.h
new file mode 100644
index 00000000000..7b6ce11810b
--- /dev/null
+++ b/libfrog/file_exchange.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __LIBFROG_FILE_EXCHANGE_H__
+#define __LIBFROG_FILE_EXCHANGE_H__
+
+int xfrog_file_exchange_prep(struct xfs_fd *file2, uint64_t flags,
+		int64_t file2_offset, int file1_fd, int64_t file1_offset,
+		int64_t length, struct xfs_exch_range *req);
+int xfrog_file_exchange(struct xfs_fd *xfd, struct xfs_exch_range *req);
+
+#endif	/* __LIBFROG_FILE_EXCHANGE_H__ */
diff --git a/libfrog/fsgeom.h b/libfrog/fsgeom.h
index ca38324e853..2ff748caaf4 100644
--- a/libfrog/fsgeom.h
+++ b/libfrog/fsgeom.h
@@ -50,6 +50,12 @@ struct xfs_fd {
 /* Only use v5 bulkstat/inumbers ioctls. */
 #define XFROG_FLAG_BULKSTAT_FORCE_V5	(1 << 1)
 
+/* Only use XFS_IOC_SWAPEXT for file data exchanges. */
+#define XFROG_FLAG_FORCE_SWAPEXT	(1 << 2)
+
+/* Only use XFS_IOC_EXCHANGE_RANGE for file data exchanges. */
+#define XFROG_FLAG_FORCE_EXCH_RANGE	(1 << 3)
+
 /* Static initializers */
 #define XFS_FD_INIT(_fd)	{ .fd = (_fd), }
 #define XFS_FD_INIT_EMPTY	XFS_FD_INIT(-1)

From patchwork Sun Dec 31 22:30:57 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507892
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id E4BDCC8C8
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 22:30:58 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="Ngfjfs/C"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 765B6C433C8;
	Sun, 31 Dec 2023 22:30:58 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704061858;
	bh=LS9EeKH1LyN2PoHSOlGOZNdF2qy/kZ9oK2/N/zFYqkA=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=Ngfjfs/CC7vxdMu8WLn7UWT7SQYtKzdZt1+cR9WpQvAmhu1H7JYig7phtTaVyiEM4
	 5ZWwr3DwE1ohvzNLhBkhclj32S4PKObtyuZuBLwKYxvda/SXbg6Ita/Gz4nBUZClvm
	 WndH6CGrpUzcvu4nijAtnicZUG1RJ5hjMTXVh8CJYIXOYJ7bVelwvZxQrp2aFDAYmb
	 sjYVC1YH1AtOAVYzDad1vLmjACd3wtABQDtFJX+gG9Kdhh7dPQXs7Yam0vZkqeOd4C
	 GSOZ/KoAVqDmbT8rQM0zkPX5W1qOjGiflO1XVZgHIoWApH2TTlxPzdH5x/8xAD9Q5p
	 ZRKORKZgI/w3Q==
Date: Sun, 31 Dec 2023 14:30:57 -0800
Subject: [PATCH 15/20] xfs_logprint: support dumping swapext log items
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404996475.1796128.2613882333794931172.stgit@frogsfrogsfrogs>
In-Reply-To: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
References: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Support dumping swapext log items.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 logprint/log_misc.c      |   11 ++++
 logprint/log_print_all.c |   12 ++++
 logprint/log_redo.c      |  128 ++++++++++++++++++++++++++++++++++++++++++++++
 logprint/logprint.h      |    6 ++
 4 files changed, 157 insertions(+)

diff --git a/logprint/log_misc.c b/logprint/log_misc.c
index 836156e0d58..565e7b76284 100644
--- a/logprint/log_misc.c
+++ b/logprint/log_misc.c
@@ -1052,6 +1052,17 @@ xlog_print_record(
 					be32_to_cpu(op_head->oh_len));
 			break;
 		    }
+		    case XFS_LI_SXI: {
+			skip = xlog_print_trans_sxi(&ptr,
+					be32_to_cpu(op_head->oh_len),
+					continued);
+			break;
+		    }
+		    case XFS_LI_SXD: {
+			skip = xlog_print_trans_sxd(&ptr,
+					be32_to_cpu(op_head->oh_len));
+			break;
+		    }
 		    case XFS_LI_QUOTAOFF: {
 			skip = xlog_print_trans_qoff(&ptr,
 					be32_to_cpu(op_head->oh_len));
diff --git a/logprint/log_print_all.c b/logprint/log_print_all.c
index 8d3ede190e5..6e528fcd097 100644
--- a/logprint/log_print_all.c
+++ b/logprint/log_print_all.c
@@ -440,6 +440,12 @@ xlog_recover_print_logitem(
 	case XFS_LI_BUI:
 		xlog_recover_print_bui(item);
 		break;
+	case XFS_LI_SXD:
+		xlog_recover_print_sxd(item);
+		break;
+	case XFS_LI_SXI:
+		xlog_recover_print_sxi(item);
+		break;
 	case XFS_LI_DQUOT:
 		xlog_recover_print_dquot(item);
 		break;
@@ -498,6 +504,12 @@ xlog_recover_print_item(
 	case XFS_LI_BUI:
 		printf("BUI");
 		break;
+	case XFS_LI_SXD:
+		printf("SXD");
+		break;
+	case XFS_LI_SXI:
+		printf("SXI");
+		break;
 	case XFS_LI_DQUOT:
 		printf("DQ ");
 		break;
diff --git a/logprint/log_redo.c b/logprint/log_redo.c
index edf7e0fbfa9..770485df75d 100644
--- a/logprint/log_redo.c
+++ b/logprint/log_redo.c
@@ -847,3 +847,131 @@ xlog_recover_print_attrd(
 		f->alfd_size,
 		(unsigned long long)f->alfd_alf_id);
 }
+
+/* Atomic Extent Swapping Items */
+
+static int
+xfs_sxi_copy_format(
+	struct xfs_sxi_log_format *sxi,		/* in: raw log region */
+	uint			  len,		/* in: bytes at sxi */
+	struct xfs_sxi_log_format *dst_fmt,	/* out: exactly one struct */
+	int			  continued)	/* nonzero skips size check */
+{
+	if (len == sizeof(struct xfs_sxi_log_format) || continued) {
+		memcpy(dst_fmt, sxi, min((size_t)len, sizeof(*dst_fmt)));
+		return 0;
+	}
+	fprintf(stderr, _("%s: bad size of SXI format: %u; expected %zu\n"),
+		progname, len, sizeof(struct xfs_sxi_log_format));
+	return 1;
+}
+
+int
+xlog_print_trans_sxi(
+	char			**ptr,		/* in/out: log cursor */
+	uint			src_len,	/* bytes in this region */
+	int			continued)	/* nonzero: CONTINUE set */
+{
+	struct xfs_sxi_log_format *src_f, *f = NULL;
+	struct xfs_swap_extent	*ex;
+	int			error = 0;
+
+	src_f = malloc(src_len);
+	if (src_f == NULL) {
+		fprintf(stderr, _("%s: %s: malloc failed\n"),
+			progname, __func__);
+		exit(1);
+	}
+	memcpy(src_f, *ptr, src_len);
+	*ptr += src_len;
+
+	/* a continued region must still hold the whole format struct */
+	if (continued && src_len < sizeof(struct xfs_sxi_log_format)) {
+		printf(_("SXI: Not enough data to decode further\n"));
+		error = 1;
+		goto error;
+	}
+
+	f = malloc(sizeof(struct xfs_sxi_log_format));
+	if (f == NULL) {
+		fprintf(stderr, _("%s: %s: malloc failed\n"),
+			progname, __func__);
+		exit(1);
+	}
+	if (xfs_sxi_copy_format(src_f, src_len, f, continued)) {
+		error = 1;
+		goto error;
+	}
+
+	printf(_("SXI:  #regs: %d	num_extents: 1  id: 0x%llx\n"),
+		f->sxi_size, (unsigned long long)f->sxi_id);
+
+	if (continued) {
+		printf(_("SXI extent data skipped (CONTINUE set, no space)\n"));
+		goto error;
+	}
+
+	ex = &f->sxi_extent;
+	printf("(ino1: 0x%llx, ino2: 0x%llx, off1: %lld, off2: %lld, len: %lld, flags: 0x%llx)\n",
+		(unsigned long long)ex->sx_inode1,
+		(unsigned long long)ex->sx_inode2,
+		(long long)ex->sx_startoff1,	/* %lld takes a signed value */
+		(long long)ex->sx_startoff2,
+		(long long)ex->sx_blockcount,
+		(unsigned long long)ex->sx_flags);
+error:
+	free(src_f);	/* shared exit: also reached on success */
+	free(f);
+	return error;
+}
+
+void
+xlog_recover_print_sxi(
+	struct xlog_recover_item	*item)
+{
+	char				*region = item->ri_buf[0].i_addr;
+
+	/*
+	 * Decode the item's first buffer as a complete (not continued) SXI.
+	 * The cursor advance and the return value are both ignored here.
+	 */
+	xlog_print_trans_sxi(&region, item->ri_buf[0].i_len, 0);
+}
+
+/* Print an SXD log item at *ptr; returns 0, or 1 if the region is short. */
+int
+xlog_print_trans_sxd(
+	char				**ptr,
+	uint				len)
+{
+	struct xfs_sxd_log_format	sxd;
+	uint				want = sizeof(sxd);
+
+	/*
+	 * Copy out no more than the region holds, but always step the cursor
+	 * past the whole region.
+	 */
+	memcpy(&sxd, *ptr, min(want, len));
+	*ptr += len;
+
+	if (len < want) {
+		printf(_("SXD: Not enough data to decode further\n"));
+		return 1;
+	}
+
+	/* Only the fixed-size format header is decoded and printed. */
+	printf(_("SXD:  #regs: %d	                 id: 0x%llx\n"),
+		sxd.sxd_size,
+		(unsigned long long)sxd.sxd_sxi_id);
+	return 0;
+}
+
+void
+xlog_recover_print_sxd(
+	struct xlog_recover_item	*item)
+{
+	char				*f = item->ri_buf[0].i_addr;
+
+	/* Pass the real region length so a short item is reported, not read. */
+	xlog_print_trans_sxd(&f, item->ri_buf[0].i_len);
+}
diff --git a/logprint/logprint.h b/logprint/logprint.h
index b4479c240d9..892b280b548 100644
--- a/logprint/logprint.h
+++ b/logprint/logprint.h
@@ -65,4 +65,10 @@ extern void xlog_recover_print_attri(struct xlog_recover_item *item);
 extern int xlog_print_trans_attrd(char **ptr, uint len);
 extern void xlog_recover_print_attrd(struct xlog_recover_item *item);
 extern void xlog_print_op_header(xlog_op_header_t *op_head, int i, char **ptr);
+
+extern int xlog_print_trans_sxi(char **ptr, uint src_len, int continued);
+extern void xlog_recover_print_sxi(struct xlog_recover_item *item);
+extern int xlog_print_trans_sxd(char **ptr, uint len);
+extern void xlog_recover_print_sxd(struct xlog_recover_item *item);
+
 #endif	/* LOGPRINT_H */

From patchwork Sun Dec 31 22:31:13 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507893
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 89E62C8CB
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 22:31:14 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="qRIAYy3A"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 17D66C433C8;
	Sun, 31 Dec 2023 22:31:14 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704061874;
	bh=TM9NXqiMHDSBaX9QW4FIpK2KcrK+zbON4AXSzj9JQxE=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=qRIAYy3AyWxeJO6aZFZXhickcLPrZDnGv71PlvzjerqObfmNXIJWFAa16IWKxUeNS
	 MwAhrjBPcHYH95rLHA0tB3XCiRWG2P+VNG5KFLehF7dD93z90usprB6kTzXa+RGjMS
	 aVFUo0wyMAXHRM6WCYcqQqPE0mWFzo5hy7Z2CQLWH/eqhkPUyn6yZUrpbdUGTg5Oa6
	 Rzz3hB9su1yRmi18MxTXbdeZITYmEFpCPI3OMbQ8EsIyRcJGkGiBeHpsX4wRyZ9RF+
	 A3c0B2GRW9XJ4kQ3IJdVOtGb7m/TKoCmDyQc8d5kMNVRHB9GsaoB6vQdG4KQuMiW5V
	 ZjjPCdBK+J0Fg==
Date: Sun, 31 Dec 2023 14:31:13 -0800
Subject: [PATCH 16/20] xfs_fsr: convert to bulkstat v5 ioctls
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404996489.1796128.15788565040447636713.stgit@frogsfrogsfrogs>
In-Reply-To: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
References: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Now that libhandle can, er, handle bulkstat information coming from the
v5 bulkstat ioctl, port xfs_fsr to use the new interfaces instead of
repeatedly converting things back and forth.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fsr/xfs_fsr.c    |  148 ++++++++++++++++++++++++++++++------------------------
 libfrog/fsgeom.c |   45 ++++++++++++----
 libfrog/fsgeom.h |    1 
 3 files changed, 117 insertions(+), 77 deletions(-)

diff --git a/fsr/xfs_fsr.c b/fsr/xfs_fsr.c
index ba02506d8e4..8e916faee94 100644
--- a/fsr/xfs_fsr.c
+++ b/fsr/xfs_fsr.c
@@ -65,10 +65,10 @@ static int	pagesize;
 
 void usage(int ret);
 static int  fsrfile(char *fname, xfs_ino_t ino);
-static int  fsrfile_common( char *fname, char *tname, char *mnt,
-                            int fd, struct xfs_bstat *statp);
-static int  packfile(char *fname, char *tname, int fd,
-                     struct xfs_bstat *statp, struct fsxattr *fsxp);
+static int  fsrfile_common(char *fname, char *tname, char *mnt,
+			   struct xfs_fd *file_fd, struct xfs_bulkstat *statp);
+static int  packfile(char *fname, char *tname, struct xfs_fd *file_fd,
+                     struct xfs_bulkstat *statp, struct fsxattr *fsxp);
 static void fsrdir(char *dirname);
 static int  fsrfs(char *mntdir, xfs_ino_t ino, int targetrange);
 static void initallfs(char *mtab);
@@ -80,7 +80,7 @@ int xfs_getrt(int fd, struct statvfs *sfbp);
 char * gettmpname(char *fname);
 char * getparent(char *fname);
 int fsrprintf(const char *fmt, ...);
-int read_fd_bmap(int, struct xfs_bstat *, int *);
+int read_fd_bmap(int, struct xfs_bulkstat *, int *);
 static void tmp_init(char *mnt);
 static char * tmp_next(char *mnt);
 static void tmp_close(char *mnt);
@@ -102,6 +102,26 @@ static int	nfrags = 0;	/* Debug option: Coerse into specific number
 				 * of extents */
 static int	openopts = O_CREAT|O_EXCL|O_RDWR|O_DIRECT;
 
+/*
+ * Open a file on an XFS filesystem from file handle components and fs geometry
+ * data.  Returns zero or a positive errno value.
+ */
+static int
+open_handle(
+	struct xfs_fd		*xfd,
+	jdm_fshandle_t		*fshandle,
+	struct xfs_bulkstat	*bulkstat,
+	struct xfs_fsop_geom	*fsgeom,
+	int			flags)
+{
+	xfd->fd = jdm_open_v5(fshandle, bulkstat, flags);
+	if (xfd->fd < 0)
+		return errno;	/* positive; callers pass it to strerror() */
+
+	xfd_install_geometry(xfd, fsgeom);
+	return 0;
+}
+
 static int
 xfs_swapext(int fd, xfs_swapext_t *sx)
 {
@@ -600,7 +620,6 @@ static int
 fsrfs(char *mntdir, xfs_ino_t startino, int targetrange)
 {
 	struct xfs_fd	fsxfd = XFS_FD_INIT_EMPTY;
-	int	fd;
 	int	count = 0;
 	int	ret;
 	char	fname[64];
@@ -638,10 +657,10 @@ fsrfs(char *mntdir, xfs_ino_t startino, int targetrange)
 	}
 
 	while ((ret = -xfrog_bulkstat(&fsxfd, breq) == 0)) {
-		struct xfs_bstat	bs1;
 		struct xfs_bulkstat	*buf = breq->bulkstat;
 		struct xfs_bulkstat	*p;
 		struct xfs_bulkstat	*endp;
+		struct xfs_fd		file_fd = XFS_FD_INIT_EMPTY;
 		uint32_t		buflenout = breq->hdr.ocount;
 
 		if (buflenout == 0)
@@ -658,15 +677,9 @@ fsrfs(char *mntdir, xfs_ino_t startino, int targetrange)
 			     (p->bs_extents64 < 2))
 				continue;
 
-			ret = -xfrog_bulkstat_v5_to_v1(&fsxfd, &bs1, p);
+			ret = open_handle(&file_fd, fshandlep, p,
+					&fsxfd.fsgeom, O_RDWR | O_DIRECT);
 			if (ret) {
-				fsrprintf(_("bstat conversion error: %s\n"),
-						strerror(ret));
-				continue;
-			}
-
-			fd = jdm_open(fshandlep, &bs1, O_RDWR | O_DIRECT);
-			if (fd < 0) {
 				/* This probably means the file was
 				 * removed while in progress of handling
 				 * it.  Just quietly ignore this file.
@@ -683,11 +696,12 @@ fsrfs(char *mntdir, xfs_ino_t startino, int targetrange)
 			/* Get a tmp file name */
 			tname = tmp_next(mntdir);
 
-			ret = fsrfile_common(fname, tname, mntdir, fd, &bs1);
+			ret = fsrfile_common(fname, tname, mntdir, &file_fd,
+					p);
 
 			leftoffino = p->bs_ino;
 
-			close(fd);
+			xfd_close(&file_fd);
 
 			if (ret == 0) {
 				if (--count <= 0)
@@ -735,9 +749,8 @@ fsrfile(
 {
 	struct xfs_fd		fsxfd = XFS_FD_INIT_EMPTY;
 	struct xfs_bulkstat	bulkstat;
-	struct xfs_bstat	statbuf;
+	struct xfs_fd		file_fd = XFS_FD_INIT_EMPTY;
 	jdm_fshandle_t		*fshandlep;
-	int			fd = -1;
 	int			error = -1;
 	char			*tname;
 
@@ -765,17 +778,12 @@ fsrfile(
 			fname, strerror(error));
 		goto out;
 	}
-	error = -xfrog_bulkstat_v5_to_v1(&fsxfd, &statbuf, &bulkstat);
-	if (error) {
-		fsrprintf(_("bstat conversion error on %s: %s\n"),
-			fname, strerror(error));
-		goto out;
-	}
 
-	fd = jdm_open(fshandlep, &statbuf, O_RDWR|O_DIRECT);
-	if (fd < 0) {
+	error = open_handle(&file_fd, fshandlep, &bulkstat, &fsxfd.fsgeom,
+			O_RDWR | O_DIRECT);
+	if (error) {
 		fsrprintf(_("unable to open handle %s: %s\n"),
-			fname, strerror(errno));
+			fname, strerror(error));
 		goto out;
 	}
 
@@ -783,14 +791,13 @@ fsrfile(
 	memcpy(&fsgeom, &fsxfd.fsgeom, sizeof(fsgeom));
 
 	tname = gettmpname(fname);
-
 	if (tname)
-		error = fsrfile_common(fname, tname, NULL, fd, &statbuf);
+		error = fsrfile_common(fname, tname, NULL, &file_fd,
+				&bulkstat);
 
 out:
 	xfd_close(&fsxfd);
-	if (fd >= 0)
-		close(fd);
+	xfd_close(&file_fd);
 	free(fshandlep);
 
 	return error;
@@ -816,8 +823,8 @@ fsrfile_common(
 	char		*fname,
 	char		*tname,
 	char		*fsname,
-	int		fd,
-	struct xfs_bstat *statp)
+	struct xfs_fd	*file_fd,
+	struct xfs_bulkstat *statp)
 {
 	int		error;
 	struct statvfs  vfss;
@@ -827,7 +834,7 @@ fsrfile_common(
 	if (vflag)
 		fsrprintf("%s\n", fname);
 
-	if (fsync(fd) < 0) {
+	if (fsync(file_fd->fd) < 0) {
 		fsrprintf(_("sync failed: %s: %s\n"), fname, strerror(errno));
 		return -1;
 	}
@@ -851,7 +858,7 @@ fsrfile_common(
 		fl.l_whence = SEEK_SET;
 		fl.l_start = (off_t)0;
 		fl.l_len = 0;
-		if ((fcntl(fd, F_GETLK, &fl)) < 0 ) {
+		if ((fcntl(file_fd->fd, F_GETLK, &fl)) < 0 ) {
 			if (vflag)
 				fsrprintf(_("locking check failed: %s\n"),
 					fname);
@@ -869,7 +876,7 @@ fsrfile_common(
 	/*
 	 * Check if there is room to copy the file.
 	 *
-	 * Note that xfs_bstat.bs_blksize returns the filesystem blocksize,
+	 * Note that xfs_bulkstat.bs_blksize returns the filesystem blocksize,
 	 * not the optimal I/O size as struct stat.
 	 */
 	if (statvfs(fsname ? fsname : fname, &vfss) < 0) {
@@ -886,7 +893,7 @@ fsrfile_common(
 		return 1;
 	}
 
-	if ((ioctl(fd, FS_IOC_FSGETXATTR, &fsx)) < 0) {
+	if ((ioctl(file_fd->fd, FS_IOC_FSGETXATTR, &fsx)) < 0) {
 		fsrprintf(_("failed to get inode attrs: %s\n"), fname);
 		return(-1);
 	}
@@ -902,7 +909,7 @@ fsrfile_common(
 		return(0);
 	}
 	if (fsx.fsx_xflags & FS_XFLAG_REALTIME) {
-		if (xfs_getrt(fd, &vfss) < 0) {
+		if (xfs_getrt(file_fd->fd, &vfss) < 0) {
 			fsrprintf(_("cannot get realtime geometry for: %s\n"),
 				fname);
 			return(-1);
@@ -928,7 +935,7 @@ fsrfile_common(
 	 * file we're defragging, in packfile().
 	 */
 
-	if ((error = packfile(fname, tname, fd, statp, &fsx)))
+	if ((error = packfile(fname, tname, file_fd, statp, &fsx)))
 		return error;
 	return -1; /* no error */
 }
@@ -952,7 +959,7 @@ static int
 fsr_setup_attr_fork(
 	int		fd,
 	int		tfd,
-	struct xfs_bstat *bstatp)
+	struct xfs_bulkstat *bstatp)
 {
 #ifdef HAVE_FSETXATTR
 	struct xfs_fd	txfd = XFS_FD_INIT(tfd);
@@ -1136,23 +1143,28 @@ fsr_setup_attr_fork(
  *  1: No change / No Error
  */
 static int
-packfile(char *fname, char *tname, int fd,
-	 struct xfs_bstat *statp, struct fsxattr *fsxp)
+packfile(
+	char			*fname,
+	char			*tname,
+	struct xfs_fd		*file_fd,
+	struct xfs_bulkstat	*statp,
+	struct fsxattr		*fsxp)
 {
-	int 		tfd = -1;
-	int		srval;
-	int		retval = -1;	/* Failure is the default */
-	int		nextents, extent, cur_nextents, new_nextents;
-	unsigned	blksz_dio;
-	unsigned	dio_min;
-	struct dioattr	dio;
-	static xfs_swapext_t   sx;
-	struct xfs_flock64  space;
-	off64_t 	cnt, pos;
-	void 		*fbuf = NULL;
-	int 		ct, wc, wc_b4;
-	char		ffname[SMBUFSZ];
-	int		ffd = -1;
+	int			tfd = -1;
+	int			srval;
+	int			retval = -1;	/* Failure is the default */
+	int			nextents, extent, cur_nextents, new_nextents;
+	unsigned		blksz_dio;
+	unsigned		dio_min;
+	struct dioattr		dio;
+	static xfs_swapext_t	sx;
+	struct xfs_flock64	space;
+	off64_t			cnt, pos;
+	void			*fbuf = NULL;
+	int			ct, wc, wc_b4;
+	char			ffname[SMBUFSZ];
+	int			ffd = -1;
+	int			error;
 
 	/*
 	 * Work out the extent map - nextents will be set to the
@@ -1160,7 +1172,7 @@ packfile(char *fname, char *tname, int fd,
 	 * into account holes), cur_nextents is the current number
 	 * of extents.
 	 */
-	nextents = read_fd_bmap(fd, statp, &cur_nextents);
+	nextents = read_fd_bmap(file_fd->fd, statp, &cur_nextents);
 
 	if (cur_nextents == 1 || cur_nextents <= nextents) {
 		if (vflag)
@@ -1183,7 +1195,7 @@ packfile(char *fname, char *tname, int fd,
 	unlink(tname);
 
 	/* Setup extended attributes */
-	if (fsr_setup_attr_fork(fd, tfd, statp) != 0) {
+	if (fsr_setup_attr_fork(file_fd->fd, tfd, statp) != 0) {
 		fsrprintf(_("failed to set ATTR fork on tmp: %s:\n"), tname);
 		goto out;
 	}
@@ -1301,7 +1313,7 @@ packfile(char *fname, char *tname, int fd,
 				   tname, strerror(errno));
 				goto out;
 			}
-			if (lseek(fd, outmap[extent].bmv_length, SEEK_CUR) < 0) {
+			if (lseek(file_fd->fd, outmap[extent].bmv_length, SEEK_CUR) < 0) {
 				fsrprintf(_("could not lseek in file: %s : %s\n"),
 				   fname, strerror(errno));
 				goto out;
@@ -1321,7 +1333,7 @@ packfile(char *fname, char *tname, int fd,
 				ct = min(cnt + dio_min - (cnt % dio_min),
 					blksz_dio);
 			}
-			ct = read(fd, fbuf, ct);
+			ct = read(file_fd->fd, fbuf, ct);
 			if (ct == 0) {
 				/* EOF, stop trying to read */
 				extent = nextents;
@@ -1392,9 +1404,15 @@ packfile(char *fname, char *tname, int fd,
 		goto out;
 	}
 
-	sx.sx_stat     = *statp; /* struct copy */
+	error = -xfrog_bulkstat_v5_to_v1(file_fd, &sx.sx_stat, statp);
+	if (error) {
+		fsrprintf(_("bstat conversion error on %s: %s\n"),
+				fname, strerror(error));
+		goto out;
+	}
+
 	sx.sx_version  = XFS_SX_VERSION;
-	sx.sx_fdtarget = fd;
+	sx.sx_fdtarget = file_fd->fd;
 	sx.sx_fdtmp    = tfd;
 	sx.sx_offset   = 0;
 	sx.sx_length   = statp->bs_size;
@@ -1408,7 +1426,7 @@ packfile(char *fname, char *tname, int fd,
         }
 
 	/* Swap the extents */
-	srval = xfs_swapext(fd, &sx);
+	srval = xfs_swapext(file_fd->fd, &sx);
 	if (srval < 0) {
 		if (errno == ENOTSUP) {
 			if (vflag || dflag)
@@ -1504,7 +1522,7 @@ getparent(char *fname)
 #define MAPSIZE	128
 #define	OUTMAP_SIZE_INCREMENT	MAPSIZE
 
-int	read_fd_bmap(int fd, struct xfs_bstat *sin, int *cur_nextents)
+int	read_fd_bmap(int fd, struct xfs_bulkstat *sin, int *cur_nextents)
 {
 	int		i, cnt;
 	struct getbmap	map[MAPSIZE];
diff --git a/libfrog/fsgeom.c b/libfrog/fsgeom.c
index 3e7f0797d8b..6980d3ffab6 100644
--- a/libfrog/fsgeom.c
+++ b/libfrog/fsgeom.c
@@ -102,29 +102,50 @@ xfrog_geometry(
 	return -errno;
 }
 
-/*
- * Prepare xfs_fd structure for future ioctl operations by computing the xfs
- * geometry for @xfd->fd.  Returns zero or a negative error code.
- */
-int
-xfd_prepare_geometry(
+/* Compute conversion factors of an xfs_fd structure. */
+static void
+xfd_compute_conversion_factors(
 	struct xfs_fd		*xfd)
 {
-	int			ret;
-
-	ret = xfrog_geometry(xfd->fd, &xfd->fsgeom);
-	if (ret)
-		return ret;
-
 	xfd->agblklog = log2_roundup(xfd->fsgeom.agblocks);
 	xfd->blocklog = highbit32(xfd->fsgeom.blocksize);
 	xfd->inodelog = highbit32(xfd->fsgeom.inodesize);
 	xfd->inopblog = xfd->blocklog - xfd->inodelog;
 	xfd->aginolog = xfd->agblklog + xfd->inopblog;
 	xfd->blkbb_log = xfd->blocklog - BBSHIFT;
+}
+
+/*
+ * Prepare xfs_fd structure for future ioctl operations by computing the xfs
+ * geometry for @xfd->fd.  Returns zero or a negative error code.
+ */
+int
+xfd_prepare_geometry(
+	struct xfs_fd		*xfd)
+{
+	int			ret;
+
+	ret = xfrog_geometry(xfd->fd, &xfd->fsgeom);
+	if (ret)
+		return ret;
+
+	xfd_compute_conversion_factors(xfd);
 	return 0;
 }
 
+/*
+ * Install a caller-supplied filesystem geometry into @xfd and compute the
+ * conversion factors from it.  Does not return a value.
+ */
+void
+xfd_install_geometry(
+	struct xfs_fd		*xfd,
+	struct xfs_fsop_geom	*fsgeom)
+{
+	memcpy(&xfd->fsgeom, fsgeom, sizeof(*fsgeom));
+	xfd_compute_conversion_factors(xfd);
+}
+
 /* Open a file on an XFS filesystem.  Returns zero or a negative error code. */
 int
 xfd_open(
diff --git a/libfrog/fsgeom.h b/libfrog/fsgeom.h
index 2ff748caaf4..7e002c5137a 100644
--- a/libfrog/fsgeom.h
+++ b/libfrog/fsgeom.h
@@ -61,6 +61,7 @@ struct xfs_fd {
 #define XFS_FD_INIT_EMPTY	XFS_FD_INIT(-1)
 
 int xfd_prepare_geometry(struct xfs_fd *xfd);
+void xfd_install_geometry(struct xfs_fd *xfd, struct xfs_fsop_geom *fsgeom);
 int xfd_open(struct xfs_fd *xfd, const char *pathname, int flags);
 int xfd_close(struct xfs_fd *xfd);
 

From patchwork Sun Dec 31 22:31:29 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507894
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4A4E0C8CA
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 22:31:30 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="HopOfADA"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id C93F4C433C7;
	Sun, 31 Dec 2023 22:31:29 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704061889;
	bh=dfpbXgnawP+eC94JS7M3yTs20cXzv3aaDRcKn3DO+9g=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=HopOfADAMnX8Z6KblX74XFVfNR/KXNl7YqAHbC6sfZtxANUs2TNSjS/6gM7Scev+I
	 ZpfR6NVX3Jj2bkjwZmDjxfrB3fWgihmdxTMPwjq5wjM01qYzAdebkDcc0DytmoLceF
	 nvhu1944QHbnq6Jes6v/esI5eHm/vJJISpeLsPPPLqBxd/jWNs8I2XBK3Y784JQhBo
	 ArvbYFfIM2XLBAkC9/RLSPZ/TftAZgr5wFspFlkg+bmuBOMEKzTF7/Ulant91DdAQh
	 WEFYjOu3xAiml01zNZFlpmkYl1YgFVtOsD9R3uWjEzkJ70NXvFHse+rfGHPmYDY8Z2
	 9ifTDxApJ4I4A==
Date: Sun, 31 Dec 2023 14:31:29 -0800
Subject: [PATCH 17/20] xfs_fsr: port to new swapext library function
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404996502.1796128.17245080944113896708.stgit@frogsfrogsfrogs>
In-Reply-To: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
References: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Port fsr to use the new libfrog library functions to handle swapext.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fsr/xfs_fsr.c           |   79 +++++++++++++++++++++++------------------------
 libfrog/file_exchange.c |   17 ++++++++++
 libfrog/file_exchange.h |    2 +
 3 files changed, 58 insertions(+), 40 deletions(-)

diff --git a/fsr/xfs_fsr.c b/fsr/xfs_fsr.c
index 8e916faee94..37cacffa0fd 100644
--- a/fsr/xfs_fsr.c
+++ b/fsr/xfs_fsr.c
@@ -13,6 +13,7 @@
 #include "libfrog/paths.h"
 #include "libfrog/fsgeom.h"
 #include "libfrog/bulkstat.h"
+#include "libfrog/file_exchange.h"
 
 #include <fcntl.h>
 #include <errno.h>
@@ -122,12 +123,6 @@ open_handle(
 	return 0;
 }
 
-static int
-xfs_swapext(int fd, xfs_swapext_t *sx)
-{
-    return ioctl(fd, XFS_IOC_SWAPEXT, sx);
-}
-
 static int
 xfs_fscounts(int fd, xfs_fsop_counts_t *counts)
 {
@@ -1150,14 +1145,13 @@ packfile(
 	struct xfs_bulkstat	*statp,
 	struct fsxattr		*fsxp)
 {
+	struct xfs_exch_range	fxr;
 	int			tfd = -1;
-	int			srval;
 	int			retval = -1;	/* Failure is the default */
 	int			nextents, extent, cur_nextents, new_nextents;
 	unsigned		blksz_dio;
 	unsigned		dio_min;
 	struct dioattr		dio;
-	static xfs_swapext_t	sx;
 	struct xfs_flock64	space;
 	off64_t			cnt, pos;
 	void			*fbuf = NULL;
@@ -1194,6 +1188,20 @@ packfile(
 	}
 	unlink(tname);
 
+	/*
+	 * Set up everything in the swap request except for the destination
+	 * freshness check, which we'll do separately since we already have
+	 * a bulkstat.
+	 */
+	error = xfrog_file_exchange_prep(file_fd,
+			XFS_EXCH_RANGE_NONATOMIC | XFS_EXCH_RANGE_FULL_FILES,
+			0, tfd, 0, statp->bs_size, &fxr);
+	if (error) {
+		fsrprintf(_("error %d setting up swapext request\n"), error);
+		goto out;
+	}
+	xfrog_file_exchange_require_file2_fresh(&fxr, statp);
+
 	/* Setup extended attributes */
 	if (fsr_setup_attr_fork(file_fd->fd, tfd, statp) != 0) {
 		fsrprintf(_("failed to set ATTR fork on tmp: %s:\n"), tname);
@@ -1404,19 +1412,6 @@ packfile(
 		goto out;
 	}
 
-	error = -xfrog_bulkstat_v5_to_v1(file_fd, &sx.sx_stat, statp);
-	if (error) {
-		fsrprintf(_("bstat conversion error on %s: %s\n"),
-				fname, strerror(error));
-		goto out;
-	}
-
-	sx.sx_version  = XFS_SX_VERSION;
-	sx.sx_fdtarget = file_fd->fd;
-	sx.sx_fdtmp    = tfd;
-	sx.sx_offset   = 0;
-	sx.sx_length   = statp->bs_size;
-
 	/* switch to the owner's id, to keep quota in line */
         if (fchown(tfd, statp->bs_uid, statp->bs_gid) < 0) {
                 if (vflag)
@@ -1426,25 +1421,29 @@ packfile(
         }
 
 	/* Swap the extents */
-	srval = xfs_swapext(file_fd->fd, &sx);
-	if (srval < 0) {
-		if (errno == ENOTSUP) {
-			if (vflag || dflag)
-			   fsrprintf(_("%s: file type not supported\n"), fname);
-		} else if (errno == EFAULT) {
-			/* The file has changed since we started the copy */
-			if (vflag || dflag)
-			   fsrprintf(_("%s: file modified defrag aborted\n"),
-				     fname);
-		} else if (errno == EBUSY) {
-			/* Timestamp has changed or mmap'ed file */
-			if (vflag || dflag)
-			   fsrprintf(_("%s: file busy\n"), fname);
-		} else {
-			fsrprintf(_("XFS_IOC_SWAPEXT failed: %s: %s\n"),
-				  fname, strerror(errno));
-		}
-		goto out;
+	error = xfrog_file_exchange(file_fd, &fxr);
+	switch (error) {
+		case 0:
+			break;
+	case ENOTSUP:
+		if (vflag || dflag)
+			fsrprintf(_("%s: file type not supported\n"), fname);
+		break;
+	case EFAULT:
+	case EDOM:
+		/* The file has changed since we started the copy */
+		if (vflag || dflag)
+			fsrprintf(_("%s: file modified defrag aborted\n"),
+					fname);
+		break;
+	case EBUSY:
+		/* Timestamp has changed or mmap'ed file */
+		if (vflag || dflag)
+			fsrprintf(_("%s: file busy\n"), fname);
+		break;
+	default:
+		fsrprintf(_("XFS_IOC_SWAPEXT failed: %s: %s\n"),
+			  fname, strerror(error));
 	}
 
 	/* Report progress */
diff --git a/libfrog/file_exchange.c b/libfrog/file_exchange.c
index 4a66aa752fc..5a527489aa5 100644
--- a/libfrog/file_exchange.c
+++ b/libfrog/file_exchange.c
@@ -54,6 +54,23 @@ xfrog_file_exchange_prep_freshness(
 	return 0;
 }
 
+/*
+ * Enable checking that the target (or destination) file has not been modified
+ * since a particular point in time.
+ */
+void
+xfrog_file_exchange_require_file2_fresh(
+	struct xfs_exch_range	*req,
+	struct xfs_bulkstat	*bulkstat)
+{
+	req->flags |= XFS_EXCH_RANGE_FILE2_FRESH;
+	req->file2_ino = bulkstat->bs_ino;
+	req->file2_mtime = bulkstat->bs_mtime;
+	req->file2_ctime = bulkstat->bs_ctime;
+	req->file2_mtime_nsec = bulkstat->bs_mtime_nsec;
+	req->file2_ctime_nsec = bulkstat->bs_ctime_nsec;
+}
+
 /* Prepare an extent swap request. */
 int
 xfrog_file_exchange_prep(
diff --git a/libfrog/file_exchange.h b/libfrog/file_exchange.h
index 7b6ce11810b..63dedf46a2f 100644
--- a/libfrog/file_exchange.h
+++ b/libfrog/file_exchange.h
@@ -6,6 +6,8 @@
 #ifndef __LIBFROG_FILE_EXCHANGE_H__
 #define __LIBFROG_FILE_EXCHANGE_H__
 
+void xfrog_file_exchange_require_file2_fresh(struct xfs_exch_range *req,
+		struct xfs_bulkstat *bulkstat);
 int xfrog_file_exchange_prep(struct xfs_fd *file2, uint64_t flags,
 		int64_t file2_offset, int file1_fd, int64_t file1_offset,
 		int64_t length, struct xfs_exch_range *req);

From patchwork Sun Dec 31 22:31:44 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507895
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id E6DC4C13B
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 22:31:45 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="DX1qTj+2"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 6F0DCC433C7;
	Sun, 31 Dec 2023 22:31:45 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704061905;
	bh=LiPYPf6gUsjIR+DIG7G8lhKTVR6EdOPCuRSUUdBpg7o=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=DX1qTj+2FxbKnMSJNXEn+T1xzZmuGWN8Pe2eC9IV0AFn8ezaCGJGVJNbZS1V/ZDVF
	 lBkhsEl38Ao5585uZjGfWjK77ZQGuiIfeJ0gba7/5xqXJzcXlSWUMd4PEt0mgsjsee
	 ZPTNhWmPZ3Yxo35lWCiQAp6lTryxjFnoecCks0T8Fykkd/X50mjNJGFrQS6H2tkyWM
	 rry+w3ZXF8RZhu+P27ejEEobs2/LhaF1/V1yWv31vE5I6csSWJjtvmKQcM4QE+tAYj
	 nptSYVV7oyeaXrI5h+679orv2IJGkOjKZ6r1nljv4V8q0ImloGgeVUJmu8HqDh/ugs
	 sNILoHT3O+CdA==
Date: Sun, 31 Dec 2023 14:31:44 -0800
Subject: [PATCH 18/20] xfs_fsr: skip the xattr/forkoff levering with the newer
 swapext implementations
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404996515.1796128.9395428698927356944.stgit@frogsfrogsfrogs>
In-Reply-To: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
References: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

The newer swapext implementations in the kernel run at a high enough
level (above the bmap layer) that it's no longer required to manipulate
bs_forkoff by creating garbage xattrs to get the extent tree that we
want.  If we detect the newer algorithms, skip this error-prone step.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fsr/xfs_fsr.c |   16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/fsr/xfs_fsr.c b/fsr/xfs_fsr.c
index 37cacffa0fd..44fc46dd2b1 100644
--- a/fsr/xfs_fsr.c
+++ b/fsr/xfs_fsr.c
@@ -968,6 +968,22 @@ fsr_setup_attr_fork(
 	if (!(bstatp->bs_xflags & FS_XFLAG_HASATTR))
 		return 0;
 
+	/*
+	 * If the filesystem has the ability to perform atomic extent swaps or
+	 * has the reverse mapping btree enabled, the file extent swap
+	 * implementation uses a higher level algorithm that calls into the
+	 * bmap code instead of playing games with swapping the extent forks.
+	 *
+	 * The newer bmap implementation does not require specific values of
+	 * bs_forkoff, unlike the old fork swap code.  Therefore, leave the
+	 * extended attributes alone if we know we're not using the old fork
+	 * swap strategy.  This eliminates a major source of runtime errors
+	 * in fsr.
+	 */
+	if (fsgeom.flags & (XFS_FSOP_GEOM_FLAGS_ATOMIC_SWAP |
+			    XFS_FSOP_GEOM_FLAGS_RMAPBT))
+		return 0;
+
 	/*
 	 * use the old method if we have attr1 or the kernel does not yet
 	 * support passing the fork offset in the bulkstat data.

From patchwork Sun Dec 31 22:32:00 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507896
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3DA89C129
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 22:32:01 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="WhYALYnR"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id EF27CC433C7;
	Sun, 31 Dec 2023 22:32:00 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704061921;
	bh=oCOg6QfsFByCYHOQkqnLHyQjSSY3wS050n/cHbDwxrY=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=WhYALYnREN8gBL0J20PT0n7oyvlOxtriHL3G886RUvM4JQH0E1z+r2G8sY5lIZ+tv
	 ol4kmpw30sYRbIUiv6otxsAqS27yKO+hXvOlL/ICSI5nal+np/A8+Q6rpKpObNk02b
	 Oj3zYODW7RtpTtc64hDyWOhZ9ki8GPwX6fIYyKN7Yx/iPutGqlp8dERT16bk5IIBC/
	 MorsT+vQQqqXhuAW32hNQM2QZOdFvwnzjH9kqp5ATWcLziaGQI8ZxvB2MXr5Vgq7t5
	 PfSZxiKWyEUO+IJUyNxlOPkP+rULZhw0B1hrtypFZJqU0ZHEshJ3fIQSLBawyBda59
	 pyiDpaH1HtC8Q==
Date: Sun, 31 Dec 2023 14:32:00 -0800
Subject: [PATCH 19/20] xfs_io: enhance swapext to take advantage of new api
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404996528.1796128.648560765928226673.stgit@frogsfrogsfrogs>
In-Reply-To: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
References: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Enhance the swapext command so that we can take advantage of the new
API's features and print some timing information.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 io/swapext.c      |  156 +++++++++++++++++++++++++++++++++++++++++++++++++----
 man/man8/xfs_io.8 |   54 ++++++++++++++++++
 2 files changed, 197 insertions(+), 13 deletions(-)

diff --git a/io/swapext.c b/io/swapext.c
index 15ed3559398..22476ec7563 100644
--- a/io/swapext.c
+++ b/io/swapext.c
@@ -20,7 +20,36 @@ swapext_help(void)
 	printf(_(
 "\n"
 " Swaps extents between the open file descriptor and the supplied filename.\n"
-"\n"));
+"\n"
+" -a   -- Use atomic extent swapping\n"
+" -C   -- Print timing information in a condensed format\n"
+" -d N -- Start swapping extents at this offset in the open file\n"
+" -e   -- Swap extents to the ends of both files, including the file sizes\n"
+" -f   -- Flush changed file data and metadata to disk\n"
+" -h   -- Only swap written ranges in the supplied file\n"
+" -l N -- Swap this many bytes between the two files\n"
+" -n   -- Dry run; do all the parameter validation but do not change anything.\n"
+" -s N -- Start swapping extents at this offset in the supplied file\n"
+" -t   -- Print timing information\n"
+" -u   -- Do not compare the open file's timestamps\n"
+" -v   -- 'swapext' for XFS_IOC_SWAPEXT, or 'exchrange' for XFS_IOC_EXCHANGE_RANGE\n"));
+}
+
+static void
+set_xfd_flags(
+	struct xfs_fd	*xfd,
+	int		api_ver)
+{
+	switch (api_ver) {
+	case 0:
+		xfd->flags |= XFROG_FLAG_FORCE_SWAPEXT;
+		break;
+	case 1:
+		xfd->flags |= XFROG_FLAG_FORCE_EXCH_RANGE;
+		break;
+	default:
+		break;
+	}
 }
 
 static int
@@ -31,13 +60,101 @@ swapext_f(
 	struct xfs_fd		xfd = XFS_FD_INIT(file->fd);
 	struct xfs_exch_range	fxr;
 	struct stat		stat;
-	uint64_t		flags = XFS_EXCH_RANGE_FILE2_FRESH |
+	struct timeval		t1, t2;
+	uint64_t		flags = XFS_EXCH_RANGE_NONATOMIC |
+					XFS_EXCH_RANGE_FILE2_FRESH |
 					XFS_EXCH_RANGE_FULL_FILES;
+	int64_t			src_offset = 0;
+	int64_t			dest_offset = 0;
+	int64_t			length = -1;
+	size_t			fsblocksize, fssectsize;
+	int			condensed = 0, quiet_flag = 1;
+	int			api_ver = -1;
+	int			c;
 	int			fd;
 	int			ret;
 
+	init_cvtnum(&fsblocksize, &fssectsize);
+	while ((c = getopt(argc, argv, "Cad:efhl:ns:tuv:")) != -1) {
+		switch (c) {
+		case 'C':
+			condensed = 1;
+			break;
+		case 'a':
+			flags &= ~XFS_EXCH_RANGE_NONATOMIC;
+			break;
+		case 'd':
+			dest_offset = cvtnum(fsblocksize, fssectsize, optarg);
+			if (dest_offset < 0) {
+				printf(
+			_("non-numeric open file offset argument -- %s\n"),
+						optarg);
+				return 0;
+			}
+			flags &= ~XFS_EXCH_RANGE_FULL_FILES;
+			break;
+		case 'e':
+			flags |= XFS_EXCH_RANGE_TO_EOF;
+			flags &= ~XFS_EXCH_RANGE_FULL_FILES;
+			break;
+		case 'f':
+			flags |= XFS_EXCH_RANGE_FSYNC;
+			break;
+		case 'h':
+			flags |= XFS_EXCH_RANGE_FILE1_WRITTEN;
+			break;
+		case 'l':
+			length = cvtnum(fsblocksize, fssectsize, optarg);
+			if (length < 0) {
+				printf(
+			_("non-numeric length argument -- %s\n"),
+						optarg);
+				return 0;
+			}
+			flags &= ~XFS_EXCH_RANGE_FULL_FILES;
+			break;
+		case 'n':
+			flags |= XFS_EXCH_RANGE_DRY_RUN;
+			break;
+		case 's':
+			src_offset = cvtnum(fsblocksize, fssectsize, optarg);
+			if (src_offset < 0) {
+				printf(
+			_("non-numeric supplied file offset argument -- %s\n"),
+						optarg);
+				return 0;
+			}
+			flags &= ~XFS_EXCH_RANGE_FULL_FILES;
+			break;
+		case 't':
+			quiet_flag = 0;
+			break;
+		case 'u':
+			flags &= ~XFS_EXCH_RANGE_FILE2_FRESH;
+			break;
+		case 'v':
+			if (!strcmp(optarg, "swapext"))
+				api_ver = 0;
+			else if (!strcmp(optarg, "exchrange"))
+				api_ver = 1;
+			else {
+				fprintf(stderr,
+			_("version must be 'swapext' or 'exchrange'.\n"));
+				return 1;
+			}
+			break;
+		default:
+			swapext_help();
+			return 0;
+		}
+	}
+	if (optind != argc - 1) {
+		swapext_help();
+		return 0;
+	}
+
 	/* open the donor file */
-	fd = openfile(argv[1], NULL, 0, 0, NULL);
+	fd = openfile(argv[optind], NULL, 0, 0, NULL);
 	if (fd < 0)
 		return 0;
 
@@ -48,27 +165,42 @@ swapext_f(
 		goto out;
 	}
 
-	ret = fstat(file->fd, &stat);
-	if (ret) {
-		perror("fstat");
-		exitcode = 1;
-		goto out;
+	if (length < 0) {
+		ret = fstat(file->fd, &stat);
+		if (ret) {
+			perror("fstat");
+			exitcode = 1;
+			goto out;
+		}
+
+		length = stat.st_size;
 	}
 
-	ret = xfrog_file_exchange_prep(&xfd, flags, 0, fd, 0, stat.st_size,
-			&fxr);
+	ret = xfrog_file_exchange_prep(&xfd, flags, dest_offset, fd, src_offset,
+			length, &fxr);
 	if (ret) {
 		xfrog_perror(ret, "xfrog_file_exchange_prep");
 		exitcode = 1;
 		goto out;
 	}
 
+	set_xfd_flags(&xfd, api_ver);
+
+	gettimeofday(&t1, NULL);
 	ret = xfrog_file_exchange(&xfd, &fxr);
 	if (ret) {
 		xfrog_perror(ret, "swapext");
 		exitcode = 1;
 		goto out;
 	}
+	if (quiet_flag)
+		goto out;
+
+	gettimeofday(&t2, NULL);
+	t2 = tsub(t2, t1);
+
+	report_io_times("swapext", &t2, dest_offset, length, length, 1,
+			condensed);
 out:
 	close(fd);
 	return 0;
@@ -80,9 +212,9 @@ swapext_init(void)
 	swapext_cmd.name = "swapext";
 	swapext_cmd.cfunc = swapext_f;
 	swapext_cmd.argmin = 1;
-	swapext_cmd.argmax = 1;
+	swapext_cmd.argmax = -1;
 	swapext_cmd.flags = CMD_NOMAP_OK;
-	swapext_cmd.args = _("<donorfile>");
+	swapext_cmd.args = _("[-a] [-e] [-f] [-u] [-d dest_offset] [-s src_offset] [-l length] [-v swapext|exchrange] <donorfile>");
 	swapext_cmd.oneline = _("Swap extents between files.");
 	swapext_cmd.help = swapext_help;
 
diff --git a/man/man8/xfs_io.8 b/man/man8/xfs_io.8
index 56abe000f23..34f9ffe9433 100644
--- a/man/man8/xfs_io.8
+++ b/man/man8/xfs_io.8
@@ -708,10 +708,62 @@ bytes of data.
 .RE
 .PD
 .TP
-.BI swapext " donor_file "
+.BI "swapext [OPTIONS]" " donor_file "
 Swaps extent forks between files. The current open file is the target. The donor
 file is specified by path. Note that file data is not copied (file content moves
 with the fork(s)).
+Options include:
+.RS 1.0i
+.PD 0
+.TP 0.4i
+.B \-a
+Swap extent forks atomically.
+The filesystem must be able to complete the operation even if the system goes
+down.
+.TP
+.B \-C
+Print timing information in a condensed format.
+.TP
+.BI \-d " dest_offset"
+Swap extents with open file beginning at
+.IR dest_offset .
+.TP
+.B \-e
+Swap extents to the ends of both files, including the file sizes.
+.TP
+.B \-f
+Flush changed file data and file metadata to disk.
+.TP
+.B \-h
+Only swap written ranges in the supplied file.
+.TP
+.BI \-l " length"
+Swap up to
+.I length
+bytes of data.
+.TP
+.B \-n
+Perform all the parameter validation checks but don't change anything.
+.TP
+.BI \-s " src_offset"
+Swap extents with donor file beginning at
+.IR src_offset .
+.TP
+.B \-t
+Print timing information.
+.TP
+.B \-u
+Do not snapshot and compare the open file's timestamps.
+.TP
+.BI \-v " version"
+Use a particular version of the kernel interface.
+Currently supported values are
+.I swapext
+for the old XFS_IOC_SWAPEXT ioctl, and
+.I exchrange
+for the new XFS_IOC_EXCHANGE_RANGE ioctl.
+.RE
+.PD
 .TP
 .BI "set_encpolicy [ \-c " mode " ] [ \-n " mode " ] [ \-f " flags " ] [ \-s " log2_dusize " ] [ \-v " version " ] [ " keyspec " ]"
 On filesystems that support encryption, assign an encryption policy to the

From patchwork Sun Dec 31 22:32:16 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507897
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id C2729C13B
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 22:32:16 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="Sq36Qbpy"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 8FBD0C433C7;
	Sun, 31 Dec 2023 22:32:16 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704061936;
	bh=1ywWC2xbiqfdmx5FUwU+g9zLJDPumUosPue9JzwH5Ls=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=Sq36QbpyJK77hDtNPGOmTsego3RIxco8nXM/cC/b0D50F6x7lRrZTGvx1r9xjDkLO
	 AnbqBOzLPgq3as9EXNnY6jXAGx9gdnNHcAp9D8Nj981pA44z1gyZQw4u5bgs6RKwXk
	 HmxXJvji27rCMSuz7vslPkkSza3Py82qxv4691JPOc4G0vSvJpKXF0eaoYE8DmofeT
	 x5xsfCj7oo2ST4Dz7l93GHlGsNK/oVD4TBZ6pLclBhTe2wKj0909fya8AndNo3BAdy
	 NeeNPd9RGUFnZmpt4dU2fQMBDmwGOjkm8q3of50RRyggGbunsuy12Tg09VjKcfD7Vi
	 ZfawQLB+Coa/w==
Date: Sun, 31 Dec 2023 14:32:16 -0800
Subject: [PATCH 20/20] xfs_io: add atomic update commands to exercise extent
 swapping
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404996542.1796128.1936470847803770043.stgit@frogsfrogsfrogs>
In-Reply-To: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
References: <170404996260.1796128.1530179577245518199.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Add three commands to xfs_io so that we can exercise atomic file updates
as provided by reflink and atomic swapext.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 io/Makefile       |    2 
 io/atomicupdate.c |  386 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 io/init.c         |    1 
 io/io.h           |    5 +
 io/open.c         |   27 +++-
 man/man8/xfs_io.8 |   32 ++++
 6 files changed, 446 insertions(+), 7 deletions(-)
 create mode 100644 io/atomicupdate.c

diff --git a/io/Makefile b/io/Makefile
index 53fef09e899..1be6ab77d87 100644
--- a/io/Makefile
+++ b/io/Makefile
@@ -13,7 +13,7 @@ CFILES = init.c \
 	file.c freeze.c fsuuid.c fsync.c getrusage.c imap.c inject.c label.c \
 	link.c mmap.c open.c parent.c pread.c prealloc.c pwrite.c reflink.c \
 	resblks.c scrub.c seek.c shutdown.c stat.c swapext.c sync.c \
-	truncate.c utimes.c
+	truncate.c utimes.c atomicupdate.c
 
 LLDLIBS = $(LIBXCMD) $(LIBHANDLE) $(LIBFROG) $(LIBPTHREAD) $(LIBUUID)
 LTDEPENDENCIES = $(LIBXCMD) $(LIBHANDLE) $(LIBFROG)
diff --git a/io/atomicupdate.c b/io/atomicupdate.c
new file mode 100644
index 00000000000..07957b32c19
--- /dev/null
+++ b/io/atomicupdate.c
@@ -0,0 +1,386 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "platform_defs.h"
+#include "command.h"
+#include "init.h"
+#include "io.h"
+#include "input.h"
+#include "libfrog/logging.h"
+#include "libfrog/fsgeom.h"
+#include "libfrog/file_exchange.h"
+
+struct update_info {
+	/* Handle for the file being updated; its fd is restored on finish. */
+	struct xfs_fd		file_fd;
+
+	/* XFS_IOC_EXCHANGE_RANGE request to commit the changes. */
+	struct xfs_exch_range	xchg_req;
+
+	/* Original file table name, saved while the staging name is shown. */
+	char			*old_fname;
+
+	/* fd of the staging file; finish_update() matches it to file->fd. */
+	int			temp_fd;
+};
+
+enum finish_how	{
+	FINISH_ABORT,	/* drop the staged changes without exchanging */
+	FINISH_COMMIT,	/* exchange staged contents into the original file */
+	FINISH_CHECK	/* same call as commit, with the DRY_RUN flag set */
+};
+
+static struct update_info *updates;	/* in-progress updates, realloc'd */
+static unsigned int nr_updates;		/* number of valid entries in updates */
+
+static void
+startupdate_help(void)
+{
+	printf(_(
+"\n"
+" Prepare for an atomic file update, if supported by the filesystem.\n"
+" A temporary file will be opened for writing and inserted into the file\n"
+" table.  The current file will be changed to this temporary file.  Neither\n"
+" file can be closed for the duration of the update.\n"
+"\n"
+" -e   -- Start with an empty file\n"
+"\n"));
+}
+
+static int
+startupdate_f(
+	int			argc,
+	char			*argv[])
+{
+	struct fsxattr		attr;
+	struct xfs_fsop_geom	fsgeom;
+	struct fs_path		fspath;
+	struct stat		stat;
+	struct update_info	*p;
+	char			*fname;
+	char			*path = NULL, *d;
+	size_t			fname_len;
+	int			flags = IO_TMPFILE | IO_ATOMICUPDATE;
+	int			temp_fd = -1;
+	bool			clone_file = true;
+	int			c;
+	int			ret;
+
+	while ((c = getopt(argc, argv, "e")) != -1) {
+		switch (c) {
+		case 'e':
+			clone_file = false;
+			break;
+		default:
+			startupdate_help();
+			return 0;
+		}
+	}
+	if (optind != argc) {
+		startupdate_help();
+		return 0;
+	}
+
+	/* Allocate a new slot; the fail path gives it back via nr_updates--. */
+	p = realloc(updates, (++nr_updates) * sizeof(*p));
+	if (!p) {
+		perror("startupdate realloc");
+		goto fail;
+	}
+	updates = p;
+
+	/* Fill out the update information so that we can commit later. */
+	p = &updates[nr_updates - 1];
+	memset(p, 0, sizeof(*p));
+	p->file_fd.fd = file->fd;
+	ret = xfd_prepare_geometry(&p->file_fd);
+	if (ret) {
+		xfrog_perror(ret, file->name);
+		goto fail;
+	}
+
+	ret = fstat(file->fd, &stat);
+	if (ret) {
+		perror(file->name);
+		goto fail;
+	}
+
+	/* Is the current file realtime?  If so, the temp file must match. */
+	ret = ioctl(file->fd, FS_IOC_FSGETXATTR, &attr);
+	if (ret == 0 && (attr.fsx_xflags & FS_XFLAG_REALTIME))
+		flags |= IO_REALTIME;
+
+	/* Find the parent directory; a failed strdup() also bails out here. */
+	path = strdup(file->name);
+	d = path ? strrchr(path, '/') : NULL;
+	if (!d) {
+		fprintf(stderr, _("%s: cannot compute dirname?\n"), file->name);
+		goto fail;
+	}
+	*d = 0;
+
+	/* Open a temporary file to stage the extents. */
+	temp_fd = openfile(path, &fsgeom, flags, 0600, &fspath);
+	if (temp_fd < 0) {
+		perror(path);
+		goto fail;
+	}
+
+	/*
+	 * Snapshot the original file metadata in anticipation of the later
+	 * extent swap request.
+	 */
+	ret = xfrog_file_exchange_prep(&p->file_fd, XFS_EXCH_RANGE_COMMIT, 0,
+			temp_fd, 0, stat.st_size, &p->xchg_req);
+	if (ret) {
+		perror("update prep");
+		goto fail;
+	}
+
+	/* Clone all the data from the original file into the temporary file. */
+	if (clone_file) {
+		ret = ioctl(temp_fd, XFS_IOC_CLONE, p->file_fd.fd);
+		if (ret) {
+			perror(path);
+			goto fail;
+		}
+	}
+
+	/* Prepare a new path string for the duration of the update. */
+#define FILEUPDATE_STR	" (fileupdate)"
+	fname_len = strlen(file->name) + strlen(FILEUPDATE_STR);
+	fname = malloc(fname_len + 1);
+	if (!fname) {
+		perror("new path");
+		goto fail;
+	}
+	snprintf(fname, fname_len + 1, "%s%s", file->name, FILEUPDATE_STR);
+
+	/*
+	 * Install the temporary file into the same slot of the file table as
+	 * the original file.  Ensure that the original file cannot be closed.
+	 */
+	file->flags |= IO_ATOMICUPDATE;
+	p->old_fname = file->name;
+	file->name = fname;
+	p->temp_fd = file->fd = temp_fd;
+
+	free(path);
+	return 0;
+fail:
+	if (temp_fd >= 0)
+		close(temp_fd);
+	free(path);
+	nr_updates--;
+	exitcode = 1;
+	return 1;
+}
+
+static long long
+finish_update(
+	enum finish_how		how,
+	uint64_t		flags,
+	long long		*offset)
+{
+	struct update_info	*p;
+	long long		committed_bytes = 0;
+	uint64_t		saved_flags;
+	size_t			length;
+	unsigned int		i;
+	int			ret;
+
+	/* Find the update whose staging fd is the current file. */
+	for (i = 0, p = updates; i < nr_updates; i++, p++) {
+		if (p->temp_fd == file->fd)
+			break;
+	}
+
+	if (i == nr_updates) {
+		fprintf(stderr,
+	_("Current file is not the staging file for an atomic update.\n"));
+		exitcode = 1;
+		return -1;
+	}
+
+	/* Remember the prepared flags so a failed attempt can be retried. */
+	saved_flags = p->xchg_req.flags;
+	p->xchg_req.flags |= flags;
+
+	/*
+	 * Commit our changes, if desired.  If the extent swap fails, we stop
+	 * processing immediately so that we can run more xfs_io commands.
+	 */
+	switch (how) {
+	case FINISH_CHECK:
+		p->xchg_req.flags |= XFS_EXCH_RANGE_DRY_RUN;
+		fallthrough;
+	case FINISH_COMMIT:
+		ret = xfrog_file_exchange(&p->file_fd, &p->xchg_req);
+		if (ret) {
+			/* Undo this attempt's flags so they don't stick. */
+			p->xchg_req.flags = saved_flags;
+			xfrog_perror(ret, _("committing update"));
+			exitcode = 1;
+			return -1;
+		}
+		printf(_("Committed updates to '%s'.\n"), p->old_fname);
+		*offset = p->xchg_req.file2_offset;
+		committed_bytes = p->xchg_req.length;
+		break;
+	case FINISH_ABORT:
+		printf(_("Cancelled updates to '%s'.\n"), p->old_fname);
+		break;
+	}
+
+	/*
+	 * Reset the filetable to point to the original file, and close the
+	 * temporary file.
+	 */
+	free(file->name);
+	file->name = p->old_fname;
+	file->flags &= ~IO_ATOMICUPDATE;
+	ret = close(file->fd);
+	if (ret)
+		perror(_("closing temporary file"));
+	file->fd = p->file_fd.fd;
+
+	/* Remove the atomic update context, shifting things down. */
+	length = (nr_updates - i - 1) * sizeof(*p);
+	if (length)
+		memmove(p, p + 1, length);
+
+	nr_updates--;
+	return committed_bytes;
+}
+
+static void
+cancelupdate_help(void)
+{
+	printf(_(
+"\n"
+" Cancels an atomic file update.  The temporary file will be closed, and the\n"
+" current file set back to the original file.\n"
+"\n"));
+}
+
+static int
+cancelupdate_f(
+	int		argc,
+	char		*argv[])
+{
+	return finish_update(FINISH_ABORT, 0, NULL) < 0 ? 1 : 0;
+}
+
+static void
+commitupdate_help(void)
+{
+	printf(_(
+"\n"
+" Commits an atomic file update.  File contents written to the temporary file\n"
+" will be swapped atomically with the corresponding range in the original\n"
+" file.  The temporary file will be closed, and the current file set back to\n"
+" the original file.\n"
+"\n"
+" -C   -- Print timing information in a condensed format.\n"
+" -h   -- Only swap written ranges in the temporary file.\n"
+" -k   -- Do not change file size.\n"
+" -n   -- Check parameters but do not change anything.\n"
+" -q   -- Do not print timing information at all.\n"));
+}
+
+static int
+commitupdate_f(
+	int		argc,
+	char		*argv[])
+{
+	struct timeval	t1, t2;
+	enum finish_how	how = FINISH_COMMIT;
+	uint64_t	flags = XFS_EXCH_RANGE_TO_EOF;	/* cleared by -k */
+	long long	offset, len;	/* offset is set by finish_update() */
+	int		condensed = 0, quiet_flag = 0;
+	int		c;
+
+	while ((c = getopt(argc, argv, "Chknq")) != -1) {
+		switch (c) {
+		case 'C':
+			condensed = 1;
+			break;
+		case 'h':
+			flags |= XFS_EXCH_RANGE_FILE1_WRITTEN;
+			break;
+		case 'k':
+			flags &= ~XFS_EXCH_RANGE_TO_EOF;
+			break;
+		case 'n':
+			how = FINISH_CHECK;
+			break;
+		case 'q':
+			quiet_flag = 1;
+			break;
+		default:
+			commitupdate_help();
+			return 0;
+		}
+	}
+	if (optind != argc) {
+		commitupdate_help();
+		return 0;
+	}
+
+	gettimeofday(&t1, NULL);
+	len = finish_update(how, flags, &offset);
+	if (len < 0)
+		return 1;	/* finish_update() already reported the error */
+	if (quiet_flag)
+		return 0;
+
+	gettimeofday(&t2, NULL);
+	t2 = tsub(t2, t1);
+	report_io_times("commitupdate", &t2, offset, len, len, 1, condensed);
+	return 0;
+}
+
+static struct cmdinfo startupdate_cmd = {
+	.name		= "startupdate",
+	.cfunc		= startupdate_f,
+	.argmin		= 0,
+	.argmax		= -1,
+	.flags		= CMD_FLAG_ONESHOT | CMD_NOMAP_OK,
+	.help		= startupdate_help,
+};
+
+static struct cmdinfo cancelupdate_cmd = {
+	.name		= "cancelupdate",
+	.cfunc		= cancelupdate_f,
+	.argmin		= 0,
+	.argmax		= 0,
+	.flags		= CMD_FLAG_ONESHOT | CMD_NOMAP_OK,
+	.help		= cancelupdate_help,
+};
+
+static struct cmdinfo commitupdate_cmd = {
+	.name		= "commitupdate",
+	.cfunc		= commitupdate_f,
+	.argmin		= 0,
+	.argmax		= -1,
+	.flags		= CMD_FLAG_ONESHOT | CMD_NOMAP_OK,
+	.help		= commitupdate_help,
+};
+
+void
+atomicupdate_init(void)
+{
+	startupdate_cmd.oneline = _("start an atomic update of a file");
+	startupdate_cmd.args = _("[-e]");
+
+	cancelupdate_cmd.oneline = _("cancel an atomic update");
+	/* Keep this option list in sync with getopt() in commitupdate_f(). */
+	commitupdate_cmd.oneline = _("commit a file update atomically");
+	commitupdate_cmd.args = _("[-C] [-h] [-k] [-n] [-q]");
+
+	add_command(&startupdate_cmd);
+	add_command(&cancelupdate_cmd);
+	add_command(&commitupdate_cmd);
+}
diff --git a/io/init.c b/io/init.c
index 104cd2c1215..a6c3d0cf147 100644
--- a/io/init.c
+++ b/io/init.c
@@ -44,6 +44,7 @@ init_cvtnum(
 static void
 init_commands(void)
 {
+	atomicupdate_init();
 	attr_init();
 	bmap_init();
 	bulkstat_init();
diff --git a/io/io.h b/io/io.h
index fe474faf4ad..a30b96401a7 100644
--- a/io/io.h
+++ b/io/io.h
@@ -31,6 +31,9 @@
 #define IO_PATH		(1<<10)
 #define IO_NOFOLLOW	(1<<11)
 
+/* undergoing atomic update, do not close */
+#define IO_ATOMICUPDATE	(1<<12)
+
 /*
  * Regular file I/O control
  */
@@ -74,6 +77,7 @@ extern int		openfile(char *, struct xfs_fsop_geom *, int, mode_t,
 				 struct fs_path *);
 extern int		addfile(char *, int , struct xfs_fsop_geom *, int,
 				struct fs_path *);
+extern int		closefile(void);
 extern void		printxattr(uint, int, int, const char *, int, int);
 
 extern unsigned int	recurse_all;
@@ -185,3 +189,4 @@ extern void		scrub_init(void);
 extern void		repair_init(void);
 extern void		crc32cselftest_init(void);
 extern void		bulkstat_init(void);
+extern void		atomicupdate_init(void);
diff --git a/io/open.c b/io/open.c
index 15850b5557b..a30dd89a1fd 100644
--- a/io/open.c
+++ b/io/open.c
@@ -338,14 +338,19 @@ open_f(
 	return 0;
 }
 
-static int
-close_f(
-	int		argc,
-	char		**argv)
+int
+closefile(void)
 {
 	size_t		length;
 	unsigned int	offset;
 
+	if (file->flags & IO_ATOMICUPDATE) {
+		fprintf(stderr,
+	_("%s: atomic update in progress, cannot close.\n"),
+			file->name);
+		exitcode = 1;
+		return 0;
+	}
 	if (close(file->fd) < 0) {
 		perror("close");
 		exitcode = 1;
@@ -371,7 +376,19 @@ close_f(
 		free(filetable);
 		file = filetable = NULL;
 	}
-	filelist_f();
+	return 0;
+}
+
+static int
+close_f(
+	int		argc,
+	char		**argv)
+{
+	int		ret;
+
+	ret = closefile();
+	if (!ret)
+		filelist_f();
 	return 0;
 }
 
diff --git a/man/man8/xfs_io.8 b/man/man8/xfs_io.8
index 34f9ffe9433..6ebb479a344 100644
--- a/man/man8/xfs_io.8
+++ b/man/man8/xfs_io.8
@@ -1045,7 +1045,37 @@ sec uses UNIX timestamp notation and is the seconds elapsed since
 nsec is the nanoseconds since the sec. This value needs to be in
 the range 0-999999999 with UTIME_NOW and UTIME_OMIT being exceptions.
 Each (sec, nsec) pair constitutes a single timestamp value.
-
+.TP
+.BI "startupdate [ " -e ]
+Create a temporary clone of a file in which to stage file updates.
+The
+.B \-e
+option creates an empty staging file.
+.TP
+.B cancelupdate
+Abandon changes from an update staging file.
+.TP
+.BI "commitupdate [" OPTIONS ]
+Commit changes from an update staging file to the real file.
+.RS 1.0i
+.PD 0
+.TP 0.4i
+.B \-C
+Print timing information in a condensed format.
+.TP 0.4i
+.B \-h
+Only swap ranges in the update staging file that were actually written.
+.TP 0.4i
+.B \-k
+Do not change file size.
+.TP 0.4i
+.B \-n
+Check parameters without changing anything.
+.TP 0.4i
+.B \-q
+Do not print timing information at all.
+.PD
+.RE
 
 .SH MEMORY MAPPED I/O COMMANDS
 .TP