From patchwork Sun Dec 31 21:56:18 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507758
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 34DE811189
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 21:56:18 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="sQ2RzmTU"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id ADB58C433CA;
	Sun, 31 Dec 2023 21:56:18 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704059778;
	bh=yFeOMuN68qYYPMEpSZyDLwjbySyE1VG7tN59Fa7ykSk=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=sQ2RzmTU7uPN9RxSavxOu7aeccWCK4KQ37gLumzjDqBlAugAzQOi28EpD2R84kb3f
	 /J0elOjULm6a3TMW82w5f3qo/advE4A9ly9e+yrKS7gchkSduuZWCIz3jPLllYiRRN
	 1Xf7iJ+ERnN6R5/lk8SjsaFx5f2v96ZCyYqDrnR9HadjXr+eGDD0gBxlIi9XQhAJHB
	 0f95saHjIA+DZx+BGU6RsWfaoyp3gAgyNYaWVc7H47WyP84k+Tz7f0GfXBXTbYzrHD
	 15OFVNvTL7llUvpNWgUegn6f99sdMCI6St5M/0WmP4+k9HflD/cM++55f87BuTUlPR
	 lQByHqqlNe2JQ==
Date: Sun, 31 Dec 2023 13:56:18 -0800
Subject: [PATCH 1/9] vfs: explicitly pass the block size to the remap prep
 function
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404852686.1767395.2701297777524398388.stgit@frogsfrogsfrogs>
In-Reply-To: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs>
References: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Make it so that filesystems can pass an explicit blocksize to the remap
prep function.  This enables filesystems whose fundamental allocation
units are /not/ the same as the blocksize to ensure that the remapping
checks are aligned properly.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/dax.c           |    5 ++++-
 fs/remap_range.c   |   30 ++++++++++++++++++------------
 include/linux/fs.h |    3 ++-
 3 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 3380b43cb6bbb..06b631eafb46c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -2061,7 +2061,10 @@ int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 			      loff_t *len, unsigned int remap_flags,
 			      const struct iomap_ops *ops)
 {
+	unsigned int blocksize = file_inode(file_out)->i_sb->s_blocksize;
+
 	return __generic_remap_file_range_prep(file_in, pos_in, file_out,
-					       pos_out, len, remap_flags, ops);
+					       pos_out, len, remap_flags, ops,
+					       blocksize);
 }
 EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);
diff --git a/fs/remap_range.c b/fs/remap_range.c
index f75ff15c94976..5d5b802f56086 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -30,18 +30,18 @@
  */
 static int generic_remap_checks(struct file *file_in, loff_t pos_in,
 				struct file *file_out, loff_t pos_out,
-				loff_t *req_count, unsigned int remap_flags)
+				loff_t *req_count, unsigned int remap_flags,
+				unsigned int blocksize)
 {
 	struct inode *inode_in = file_in->f_mapping->host;
 	struct inode *inode_out = file_out->f_mapping->host;
 	uint64_t count = *req_count;
 	uint64_t bcount;
 	loff_t size_in, size_out;
-	loff_t bs = inode_out->i_sb->s_blocksize;
 	int ret;
 
 	/* The start of both ranges must be aligned to an fs block. */
-	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
+	if (!IS_ALIGNED(pos_in, blocksize) || !IS_ALIGNED(pos_out, blocksize))
 		return -EINVAL;
 
 	/* Ensure offsets don't wrap. */
@@ -75,10 +75,10 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in,
 	 */
 	if (pos_in + count == size_in &&
 	    (!(remap_flags & REMAP_FILE_DEDUP) || pos_out + count == size_out)) {
-		bcount = ALIGN(size_in, bs) - pos_in;
+		bcount = ALIGN(size_in, blocksize) - pos_in;
 	} else {
-		if (!IS_ALIGNED(count, bs))
-			count = ALIGN_DOWN(count, bs);
+		if (!IS_ALIGNED(count, blocksize))
+			count = ALIGN_DOWN(count, blocksize);
 		bcount = count;
 	}
 
@@ -128,9 +128,10 @@ static int generic_remap_check_len(struct inode *inode_in,
 				   struct inode *inode_out,
 				   loff_t pos_out,
 				   loff_t *len,
-				   unsigned int remap_flags)
+				   unsigned int remap_flags,
+				   unsigned int blocksize)
 {
-	u64 blkmask = i_blocksize(inode_in) - 1;
+	u64 blkmask = blocksize - 1;
 	loff_t new_len = *len;
 
 	if ((*len & blkmask) == 0)
@@ -271,7 +272,8 @@ int
 __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 				struct file *file_out, loff_t pos_out,
 				loff_t *len, unsigned int remap_flags,
-				const struct iomap_ops *dax_read_ops)
+				const struct iomap_ops *dax_read_ops,
+				unsigned int blocksize)
 {
 	struct inode *inode_in = file_inode(file_in);
 	struct inode *inode_out = file_inode(file_out);
@@ -306,7 +308,7 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 
 	/* Check that we don't violate system file offset limits. */
 	ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
-			remap_flags);
+			remap_flags, blocksize);
 	if (ret || *len == 0)
 		return ret;
 
@@ -347,7 +349,7 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 	}
 
 	ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
-			remap_flags);
+			remap_flags, blocksize);
 	if (ret || *len == 0)
 		return ret;
 
@@ -357,13 +359,17 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 
 	return ret;
 }
+EXPORT_SYMBOL(__generic_remap_file_range_prep);
 
 int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 				  struct file *file_out, loff_t pos_out,
 				  loff_t *len, unsigned int remap_flags)
 {
+	unsigned int blocksize = file_inode(file_out)->i_sb->s_blocksize;
+
 	return __generic_remap_file_range_prep(file_in, pos_in, file_out,
-					       pos_out, len, remap_flags, NULL);
+					       pos_out, len, remap_flags, NULL,
+					       blocksize);
 }
 EXPORT_SYMBOL(generic_remap_file_range_prep);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d9badb63bfc29..aeead94a5b167 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2036,7 +2036,8 @@ int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write);
 int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 				    struct file *file_out, loff_t pos_out,
 				    loff_t *len, unsigned int remap_flags,
-				    const struct iomap_ops *dax_read_ops);
+				    const struct iomap_ops *dax_read_ops,
+				    unsigned int block_size);
 int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 				  struct file *file_out, loff_t pos_out,
 				  loff_t *count, unsigned int remap_flags);

From patchwork Sun Dec 31 21:56:33 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507759
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 97BFC1171B
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 21:56:34 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="Iqc0Zclb"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 6759BC433C7;
	Sun, 31 Dec 2023 21:56:34 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704059794;
	bh=cA56i82jDHrYXEthv9I4eCGmXAAuHI7VU4dLWfEg4vk=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=Iqc0Zclb/fST+lAUbJ3xUduVENHjJxs5fYPF7AnzVqo6QGekH4RA9XTJTR4Uf9Wo/
	 GKeyEEoZUy1kQkMem4QkfIb/dvQULVPf4RsyEZJJFoF8i+87BWc4ALKrKumagKaQF9
	 3cxcsmQOzqnROk52yBcVGtLyUKp6fxGwa2mX7pglscknEBvjGWiDqzJR3oyBUYVmWg
	 UI/dRIVZ/JzqDHYx/SDL4tLbvJlSKXy4Qm19CCSZzh0e4jTIRRXq7tNP1+B5Kwk6Io
	 l3qMLXB7RLsijYEygRMnvUK72qxbAahDtR31x0BInd51heY2yBteVZCLV3+MtkiaMI
	 HklbvyQmYJyVA==
Date: Sun, 31 Dec 2023 13:56:33 -0800
Subject: [PATCH 2/9] xfs: enable CoW when rt extent size is larger than 1
 block
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404852703.1767395.4031402007268981201.stgit@frogsfrogsfrogs>
In-Reply-To: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs>
References: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Copy on write encounters a major plot twist when the file being CoW'd
lives on the realtime volume and the realtime extent size is larger than
a single filesystem block.  XFS can only unmap and remap full rt
extents, which means that allocations are always done in units of full
rt extents, and a request to unmap less than one extent is treated as a
request to convert an extent to unwritten status.

This behavioral quirk is not compatible with the existing CoW mechanism,
so we have to intercept every path through which files can be modified
to ensure that we dirty an entire rt extent at once so that we can remap
a full rt extent.  Use the existing VFS unshare functions to dirty the
page cache to set that up.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_file.c    |  224 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_file.h    |    3 +
 fs/xfs/xfs_inode.h   |    6 +
 fs/xfs/xfs_iops.c    |   22 +++++
 fs/xfs/xfs_reflink.c |   39 +++++++++
 fs/xfs/xfs_trace.h   |    1 
 6 files changed, 293 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index fdbeb6c3fbc44..ebdda286cb2a2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -358,6 +358,116 @@ xfs_file_splice_read(
 	return ret;
 }
 
+/*
+ * Decide if this file write requires COWing-around at either end of the write
+ * range.  This is only required if the file allocation unit is larger than
+ * 1FSB and the write range is not aligned with the allocation unit.
+ */
+static bool
+xfs_file_write_needs_cow_around(
+	struct xfs_inode	*ip,
+	loff_t			pos,
+	long long int		count)
+{
+	/*
+	 * No COWing required if this inode doesn't do COW.
+	 *
+	 * If the allocation unit is 1FSB, we do not need to COW around the
+	 * edges of the operation range.  This applies to all files on the data
+	 * device and rt files that have an extent size of 1FSB.
+	 */
+	if (!xfs_inode_needs_cow_around(ip))
+		return false;
+
+	/*
+	 * Otherwise, check that the operation is aligned to the rt extent
+	 * size.  Any unaligned operation /must/ be COWed around since the
+	 * regular reflink code only handles extending writes up to fsblock
+	 * boundaries.
+	 */
+	return !xfs_is_falloc_aligned(ip, pos, count);
+}
+
+/* Do we need to COW-around at this offset to handle a truncate up or down? */
+bool
+xfs_truncate_needs_cow_around(
+	struct xfs_inode	*ip,
+	loff_t			pos)
+{
+	return xfs_file_write_needs_cow_around(ip, pos, 0);
+}
+
+/* Does this file write require COWing around? */
+static inline bool
+xfs_iocb_needs_cow_around(
+	struct xfs_inode	*ip,
+	const struct kiocb	*iocb,
+	const struct iov_iter	*from)
+{
+	return xfs_file_write_needs_cow_around(ip, iocb->ki_pos,
+			iov_iter_count(from));
+}
+
+/* Unshare the allocation unit mapped to the given file position.  */
+inline int
+xfs_file_unshare_at(
+	struct xfs_inode	*ip,
+	loff_t			pos)
+{
+	loff_t			isize = i_size_read(VFS_I(ip));
+	unsigned int		extsize, len;
+	uint32_t		mod;
+
+	len = extsize = xfs_inode_alloc_unitsize(ip);
+
+	/* Open-coded rounddown_64 so that we can skip out if aligned */
+	div_u64_rem(pos, extsize, &mod);
+	if (mod == 0)
+		return 0;
+	pos -= mod;
+
+	/* Do not extend the file. */
+	if (pos >= isize)
+		return 0;
+	if (pos + len > isize)
+		len = isize - pos;
+
+	trace_xfs_file_cow_around(ip, pos, len);
+
+	if (IS_DAX(VFS_I(ip)))
+		return dax_file_unshare(VFS_I(ip), pos, len,
+				&xfs_dax_write_iomap_ops);
+	return iomap_file_unshare(VFS_I(ip), pos, len,
+			&xfs_buffered_write_iomap_ops);
+}
+
+/*
+ * Dirty the pages on either side of a write request as needed to satisfy
+ * alignment requirements if we're going to perform a copy-write.
+ *
+ * This is only needed for realtime files when the rt extent size is larger
+ * than 1 fs block, because we don't allow a logical rt extent in a file to map
+ * to multiple physical rt extents.  In other words, we can only map and unmap
+ * full rt extents.  Note that page cache doesn't exist above EOF, so be
+ * careful to stay below EOF.
+ */
+static int
+xfs_file_cow_around(
+	struct xfs_inode	*ip,
+	loff_t			pos,
+	long long int		count)
+{
+	int			error;
+
+	/* Unshare at the start of the extent. */
+	error = xfs_file_unshare_at(ip,  pos);
+	if (error)
+		return error;
+
+	/* Unshare at the end. */
+	return xfs_file_unshare_at(ip, pos + count);
+}
+
 /*
  * Common pre-write limit and setup checks.
  *
@@ -397,9 +507,10 @@ xfs_file_write_checks(
 
 	/*
 	 * For changing security info in file_remove_privs() we need i_rwsem
-	 * exclusively.
+	 * exclusively.  We also need it to COW around the range being written.
 	 */
-	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
+	if (*iolock == XFS_IOLOCK_SHARED &&
+	    (!IS_NOSEC(inode) || xfs_iocb_needs_cow_around(ip, iocb, from))) {
 		xfs_iunlock(ip, *iolock);
 		*iolock = XFS_IOLOCK_EXCL;
 		error = xfs_ilock_iocb(iocb, *iolock);
@@ -410,6 +521,22 @@ xfs_file_write_checks(
 		goto restart;
 	}
 
+	/*
+	 * The write is not aligned to the file's allocation unit.  If either
+	 * of the allocation units at the start or end of the write range are
+	 * shared, unshare them through the page cache.
+	 */
+	if (xfs_iocb_needs_cow_around(ip, iocb, from)) {
+		ASSERT(*iolock == XFS_IOLOCK_EXCL);
+
+		inode_dio_wait(VFS_I(ip));
+		drained_dio = true;
+
+		error = xfs_file_cow_around(ip, iocb->ki_pos, count);
+		if (error)
+			return error;
+	}
+
 	/*
 	 * If the offset is beyond the size of the file, we need to zero any
 	 * blocks that fall between the existing EOF and the start of this
@@ -461,6 +588,17 @@ xfs_file_write_checks(
 			goto restart;
 		}
 
+		/*
+		 * If we're starting the write past EOF, COW the allocation
+		 * unit containing the current EOF before we start zeroing the
+		 * range between EOF and the start of the write.
+		 */
+		if (xfs_truncate_needs_cow_around(ip, isize)) {
+			error = xfs_file_unshare_at(ip, isize);
+			if (error)
+				return error;
+		}
+
 		trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
 		error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
 		if (error)
@@ -575,6 +713,16 @@ xfs_file_dio_write_aligned(
 	unsigned int		iolock = XFS_IOLOCK_SHARED;
 	ssize_t			ret;
 
+	/*
+	 * If the range to write is not aligned to an allocation unit, we will
+	 * have to COW the allocation units on both ends of the write.  Because
+	 * this runs through the page cache, it requires IOLOCK_EXCL.  This
+	 * predicate performs an unlocked access of the rt and reflink inode
+	 * state.
+	 */
+	if (xfs_iocb_needs_cow_around(ip, iocb, from))
+		iolock = XFS_IOLOCK_EXCL;
+
 	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
 	if (ret)
 		return ret;
@@ -927,6 +1075,13 @@ xfs_file_fallocate(
 		goto out_unlock;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE) {
+		/* Unshare around the region to punch, if needed. */
+		if (xfs_file_write_needs_cow_around(ip, offset, len)) {
+			error = xfs_file_cow_around(ip, offset, len);
+			if (error)
+				goto out_unlock;
+		}
+
 		error = xfs_free_file_space(ip, offset, len);
 		if (error)
 			goto out_unlock;
@@ -997,6 +1152,13 @@ xfs_file_fallocate(
 
 			trace_xfs_zero_file_space(ip);
 
+			/* Unshare around the region to zero, if needed. */
+			if (xfs_file_write_needs_cow_around(ip, offset, len)) {
+				error = xfs_file_cow_around(ip, offset, len);
+				if (error)
+					goto out_unlock;
+			}
+
 			error = xfs_free_file_space(ip, offset, len);
 			if (error)
 				goto out_unlock;
@@ -1005,6 +1167,26 @@ xfs_file_fallocate(
 			      round_down(offset, blksize);
 			offset = round_down(offset, blksize);
 		} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
+			/*
+			 * Enlarge the unshare region to align to a full
+			 * allocation unit.
+			 */
+			if (xfs_inode_needs_cow_around(ip)) {
+				loff_t		isize = i_size_read(VFS_I(ip));
+				unsigned int	rextsize;
+				uint32_t	mod;
+
+				rextsize = xfs_inode_alloc_unitsize(ip);
+				div_u64_rem(offset, rextsize, &mod);
+				offset -= mod;
+				len += mod;
+
+				div_u64_rem(offset + len, rextsize, &mod);
+				if (mod)
+					len += rextsize - mod;
+				if (offset + len > isize)
+					len = isize - offset;
+			}
 			error = xfs_reflink_unshare(ip, offset, len);
 			if (error)
 				goto out_unlock;
@@ -1272,6 +1454,34 @@ xfs_dax_fault(
 }
 #endif
 
+static int
+xfs_filemap_fault_around(
+	struct vm_fault		*vmf,
+	struct inode		*inode)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct folio		*folio = page_folio(vmf->page);
+	loff_t			pos;
+	ssize_t			len;
+
+	if (!xfs_inode_needs_cow_around(ip))
+		return 0;
+
+	folio_lock(folio);
+	len = folio_mkwrite_check_truncate(folio, inode);
+	if (len < 0) {
+		folio_unlock(folio);
+		return len;
+	}
+	pos = folio_pos(folio);
+	folio_unlock(folio);
+
+	if (!xfs_file_write_needs_cow_around(ip, pos, len))
+		return 0;
+
+	return xfs_file_cow_around(XFS_I(inode), pos, len);
+}
+
 /*
  * Locking for serialisation of IO during page faults. This results in a lock
  * ordering of:
@@ -1310,11 +1520,21 @@ __xfs_filemap_fault(
 		if (ret & VM_FAULT_NEEDDSYNC)
 			ret = dax_finish_sync_fault(vmf, order, pfn);
 	} else if (write_fault) {
+		/*
+		 * Unshare all the blocks in this rt extent surrounding
+		 * this page.
+		 */
+		int error = xfs_filemap_fault_around(vmf, inode);
+		if (error) {
+			ret = vmf_fs_error(error);
+			goto out_unlock;
+		}
 		ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops);
 	} else {
 		ret = filemap_fault(vmf);
 	}
 
+out_unlock:
 	if (lock_mode)
 		xfs_iunlock(XFS_I(inode), lock_mode);
 
diff --git a/fs/xfs/xfs_file.h b/fs/xfs/xfs_file.h
index 2ad91f755caf3..24490ea49e16c 100644
--- a/fs/xfs/xfs_file.h
+++ b/fs/xfs/xfs_file.h
@@ -12,4 +12,7 @@ extern const struct file_operations xfs_dir_file_operations;
 bool xfs_is_falloc_aligned(struct xfs_inode *ip, loff_t pos,
 		long long int len);
 
+bool xfs_truncate_needs_cow_around(struct xfs_inode *ip, loff_t pos);
+int xfs_file_unshare_at(struct xfs_inode *ip, loff_t pos);
+
 #endif /* __XFS_FILE_H__ */
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 6013a97d02c5d..df8197fe4cb82 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -331,6 +331,12 @@ static inline bool xfs_inode_has_bigallocunit(struct xfs_inode *ip)
 	return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1;
 }
 
+/* Decide if we need to unshare the blocks around a range that we're writing. */
+static inline bool xfs_inode_needs_cow_around(struct xfs_inode *ip)
+{
+	return xfs_is_cow_inode(ip) && xfs_inode_has_bigallocunit(ip);
+}
+
 /*
  * Return the buftarg used for data allocations on a given inode.
  */
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 461e77dd54e38..71a4398fd36ac 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -27,6 +27,7 @@
 #include "xfs_xattr.h"
 #include "xfs_file.h"
 #include "xfs_bmap.h"
+#include "xfs_reflink.h"
 
 #include <linux/posix_acl.h>
 #include <linux/security.h>
@@ -877,10 +878,31 @@ xfs_setattr_size(
 	 * truncate.
 	 */
 	if (newsize > oldsize) {
+		/*
+		 * Extending the file size, so COW around the allocation unit
+		 * containing EOF before we zero the new range of the file.
+		 */
+		if (xfs_truncate_needs_cow_around(ip, oldsize)) {
+			error = xfs_file_unshare_at(ip, oldsize);
+			if (error)
+				return error;
+		}
+
 		trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
 		error = xfs_zero_range(ip, oldsize, newsize - oldsize,
 				&did_zeroing);
 	} else {
+		/*
+		 * Truncating the file, so COW around the new EOF allocation
+		 * unit before truncation zeroes the part of the EOF block
+		 * after the new EOF.
+		 */
+		if (xfs_truncate_needs_cow_around(ip, newsize)) {
+			error = xfs_file_unshare_at(ip, newsize);
+			if (error)
+				return error;
+		}
+
 		/*
 		 * iomap won't detect a dirty page over an unwritten block (or a
 		 * cow block over a hole) and subsequently skips zeroing the
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index b0f3170c11c8b..d5773f9b7ec54 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -34,6 +34,7 @@
 #include "xfs_rtalloc.h"
 #include "xfs_rtgroup.h"
 #include "xfs_imeta.h"
+#include "xfs_rtbitmap.h"
 
 /*
  * Copy on Write of Shared Blocks
@@ -297,9 +298,26 @@ xfs_reflink_convert_cow_locked(
 	struct xfs_iext_cursor	icur;
 	struct xfs_bmbt_irec	got;
 	struct xfs_btree_cur	*dummy_cur = NULL;
+	struct xfs_mount	*mp = ip->i_mount;
 	int			dummy_logflags;
 	int			error = 0;
 
+	/*
+	 * We can only remap full rt extents, so make sure that we convert the
+	 * entire extent.  The caller must ensure that this is either a direct
+	 * write that's aligned to the rt extent size, or a buffered write for
+	 * which we've dirtied extra pages to make this work properly.
+	 */
+	if (xfs_inode_needs_cow_around(ip)) {
+		xfs_fileoff_t	new_off;
+
+		new_off = xfs_rtb_rounddown_rtx(mp, offset_fsb);
+		count_fsb += offset_fsb - new_off;
+		offset_fsb = new_off;
+
+		count_fsb = xfs_rtb_roundup_rtx(mp, count_fsb);
+	}
+
 	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
 		return 0;
 
@@ -635,11 +653,21 @@ xfs_reflink_cancel_cow_blocks(
 	bool				cancel_real)
 {
 	struct xfs_ifork		*ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
+	struct xfs_mount		*mp = ip->i_mount;
 	struct xfs_bmbt_irec		got, del;
 	struct xfs_iext_cursor		icur;
 	bool				isrt = XFS_IS_REALTIME_INODE(ip);
 	int				error = 0;
 
+	/*
+	 * Shrink the range that we're cancelling if they don't align to the
+	 * realtime extent size, since we can only free full extents.
+	 */
+	if (xfs_inode_needs_cow_around(ip)) {
+		offset_fsb = xfs_rtb_roundup_rtx(mp, offset_fsb);
+		end_fsb = xfs_rtb_rounddown_rtx(mp, end_fsb);
+	}
+
 	if (!xfs_inode_has_cow_data(ip))
 		return 0;
 	if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
@@ -946,6 +974,7 @@ xfs_reflink_end_cow(
 	xfs_off_t			offset,
 	xfs_off_t			count)
 {
+	struct xfs_mount		*mp = ip->i_mount;
 	xfs_fileoff_t			offset_fsb;
 	xfs_fileoff_t			end_fsb;
 	int				error = 0;
@@ -955,6 +984,16 @@ xfs_reflink_end_cow(
 	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
 	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
 
+	/*
+	 * Make sure the end is aligned with a rt extent (if desired), since
+	 * the end of the range could be EOF.  The _convert_cow function should
+	 * have set us up to swap only full rt extents.
+	 */
+	if (xfs_inode_needs_cow_around(ip)) {
+		offset_fsb = xfs_rtb_rounddown_rtx(mp, offset_fsb);
+		end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb);
+	}
+
 	/*
 	 * Walk forwards until we've remapped the I/O range.  The loop function
 	 * repeatedly cycles the ILOCK to allocate one transaction per remapped
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index f94f144f9a39d..643cffaf3add2 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3892,6 +3892,7 @@ TRACE_EVENT(xfs_ioctl_clone,
 
 /* unshare tracepoints */
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare);
+DEFINE_SIMPLE_IO_EVENT(xfs_file_cow_around);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
 #ifdef CONFIG_XFS_RT
 DEFINE_SIMPLE_IO_EVENT(xfs_convert_bigalloc_file_space);

From patchwork Sun Dec 31 21:56:49 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507760
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 362631171B
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 21:56:50 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="ITq7QEzW"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 08DC3C433C8;
	Sun, 31 Dec 2023 21:56:49 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704059810;
	bh=gZYA1weZfJ3HP/d7VEAjufvv33VpmlWVw4WZno1SNsY=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=ITq7QEzWGlIKHkZ6KCqDeAlyTeqhV8KYdmbJ4EV9mct6TYjLYb7i/UH7sKyUz4Dw1
	 KsNzANGa5pqlZyeVHg+7T+RjV+mxF3sI/jy5Vt6rqSC7Z6BBsVwtF7u6MJoetO8zv4
	 IzvgSn8u5h9NTsQYs6nvWXXwC3CDIFOEB5qQ1ZOqK2imgFXOuRsMifL6o/2Wbn2Pzf
	 xF2HRcahtMW5IpRWQGQ12JNlkZwBwCFOr/iRSQSuGeSP7S+2sViytZ+tXPSAd9QfAu
	 5xySjwYOb8e9KGoLS4nxTHhhaqgi0+thIkfrytAjPjhW9oGZqBj/R4PK0m4UZqDO2R
	 ffFZjSRlNNI2Q==
Date: Sun, 31 Dec 2023 13:56:49 -0800
Subject: [PATCH 3/9] xfs: forcibly convert unwritten blocks within an rt
 extent before sharing
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404852719.1767395.5382072798264653253.stgit@frogsfrogsfrogs>
In-Reply-To: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs>
References: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

As noted in the previous patch, XFS can only unmap and map full rt
extents.  This means that we cannot stop mid-extent for any reason,
including stepping around unwritten/written extents.  Second, the
reflink and CoW mechanisms were not designed to handle shared unwritten
extents, so we have to do something to get rid of them.

If the user asks us to remap two files, we must scan both ranges
beforehand to convert any unwritten extents that are not aligned to rt
extent boundaries into zeroed written extents before sharing.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_reflink.c |   19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index d5773f9b7ec54..5d68603506f27 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1693,6 +1693,25 @@ xfs_reflink_remap_prep(
 	if (ret)
 		goto out_unlock;
 
+	/*
+	 * Now that we've marked both inodes for reflink, make sure that all
+	 * allocation units (AU) mapped into either files' ranges are either
+	 * wholly written, wholly unwritten, or holes.  The bmap code requires
+	 * that we align all unmap and remap requests to an AU.  We've already
+	 * flushed the page cache and finished directio for the range that's
+	 * being remapped, so we can convert the mappings directly.
+	 */
+	if (xfs_inode_has_bigallocunit(src)) {
+		ret = xfs_convert_bigalloc_file_space(src, pos_in, *len);
+		if (ret)
+			goto out_unlock;
+	}
+	if (xfs_inode_has_bigallocunit(dest)) {
+		ret = xfs_convert_bigalloc_file_space(dest, pos_out, *len);
+		if (ret)
+			goto out_unlock;
+	}
+
 	/*
 	 * If pos_out > EOF, we may have dirtied blocks between EOF and
 	 * pos_out. In that case, we need to extend the flush and unmap to cover

From patchwork Sun Dec 31 21:57:05 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507761
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id DA76111704
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 21:57:05 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="iQM578Bo"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id A9F1DC433C8;
	Sun, 31 Dec 2023 21:57:05 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704059825;
	bh=vg76k9acyABRI+YhTu1zMJQdFknTjOSlaq0rncKsg04=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=iQM578BoCxjmo6MCYdVbSW+bmBV4K+Y7uhk7SFl4i/ZcrshJdTz6aX2H7Cy973qPS
	 mr+n5fdnftj0r+v0skDDpO29QChKnDt4hJXJvZ0imdk68FMQQLs/qNbYUtvFNgEAEf
	 w0CyLCtkkJvckDGyWFCgtKO1ZfXiyHDWlHC7jhQiuxO/RIR9qzeTCeLbCr81w6RTXD
	 U8H5iuWyBitB6PIaNPGpguj3E4gCL5dxR6PTEpUQKftFZ8DRXXO3yY7A0eQUnK4idI
	 /fHIjKvLOK2XIsuQYPPmBuNr23s4Q9gl7TtKbM1tX6AYQrD5Gp5ywRJsSLuDsIFxUI
	 y6lA3qH0/OR1A==
Date: Sun, 31 Dec 2023 13:57:05 -0800
Subject: [PATCH 4/9] xfs: add some tracepoints for writeback
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404852735.1767395.8058627554300848243.stgit@frogsfrogsfrogs>
In-Reply-To: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs>
References: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Add a tracepoint so I can see where writeback is initiated.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_aops.c  |   19 ++++++++++++-------
 fs/xfs/xfs_trace.h |   34 ++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3001ddf48d6c6..1217ce197ad98 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -497,10 +497,11 @@ static const struct iomap_writeback_ops xfs_writeback_ops = {
 
 STATIC int
 xfs_vm_writepages(
-	struct address_space	*mapping,
-	struct writeback_control *wbc)
+	struct address_space		*mapping,
+	struct writeback_control	*wbc)
 {
-	struct xfs_writepage_ctx wpc = { };
+	struct xfs_writepage_ctx	wpc = { };
+	struct xfs_inode		*ip = XFS_I(mapping->host);
 
 	/*
 	 * Writing back data in a transaction context can result in recursive
@@ -509,16 +510,20 @@ xfs_vm_writepages(
 	if (WARN_ON_ONCE(current->journal_info))
 		return 0;
 
-	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+	trace_xfs_vm_writepages(ip, wbc);
+
+	xfs_iflags_clear(ip, XFS_ITRUNCATED);
 	return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
 }
 
 STATIC int
 xfs_dax_writepages(
-	struct address_space	*mapping,
-	struct writeback_control *wbc)
+	struct address_space		*mapping,
+	struct writeback_control	*wbc)
 {
-	struct xfs_inode	*ip = XFS_I(mapping->host);
+	struct xfs_inode		*ip = XFS_I(mapping->host);
+
+	trace_xfs_dax_writepages(ip, wbc);
 
 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
 	return dax_writeback_mapping_range(mapping,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 643cffaf3add2..39df20ae702c8 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1552,6 +1552,40 @@ DEFINE_IMAP_EVENT(xfs_map_blocks_alloc);
 DEFINE_IMAP_EVENT(xfs_iomap_alloc);
 DEFINE_IMAP_EVENT(xfs_iomap_found);
 
+DECLARE_EVENT_CLASS(xfs_writeback_class,
+	TP_PROTO(struct xfs_inode *ip, const struct writeback_control *wbc),
+	TP_ARGS(ip, wbc),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(loff_t, range_start)
+		__field(loff_t, range_end)
+		__field(long, nr_to_write)
+		__field(enum writeback_sync_modes, sync_mode)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->range_start = wbc->range_start;
+		__entry->range_end = wbc->range_end;
+		__entry->nr_to_write = wbc->nr_to_write;
+		__entry->sync_mode = wbc->sync_mode;
+	),
+	TP_printk("dev %d:%d ino 0x%llx range_start 0x%llx range_end 0x%llx nr_to_write %ld sync_mode %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		   __entry->ino,
+		   __entry->range_start,
+		   __entry->range_end,
+		   __entry->nr_to_write,
+		   __entry->sync_mode)
+);
+#define DEFINE_WRITEBACK_EVENT(name)	\
+DEFINE_EVENT(xfs_writeback_class, name,	\
+	TP_PROTO(struct xfs_inode *ip, const struct writeback_control *wbc), \
+	TP_ARGS(ip, wbc))
+DEFINE_WRITEBACK_EVENT(xfs_vm_writepages);
+DEFINE_WRITEBACK_EVENT(xfs_dax_writepages);
+
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, u64 count),
 	TP_ARGS(ip, offset, count),

From patchwork Sun Dec 31 21:57:20 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507762
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id D575711704
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 21:57:21 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="KnzKGD/G"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 57E1DC433C8;
	Sun, 31 Dec 2023 21:57:21 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704059841;
	bh=2p5a3yxnr9HTGuBjAk30UqelWBu9sZ+ILLpyRkjs1Js=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=KnzKGD/Gm5ptuisy7GWtDlhKky4IL2fEwePzl9wNiyfW6DrXtRbl1shjTIDxBGEti
	 vRHyI60kuMk2e14bTCWUIzdNPGtJqMj0i0aL1KznJPzfSM2YnbLnl4ovFNzsk1TwXN
	 cfkLoqT03wdQ62coxEI/Z0CG9O3YoFfay1BN8plVSEyUlvUIqLnpuscWAlG6Makdgl
	 6v+tugt3xNOReocRDXB6zMoFVlhGS9lkdeGgFxW/OmIW46GxTUYaWqiW631gb8g9ZJ
	 9gCbZsNW3nYpGHADi27ZXPo5kuEXS4b8eqLwnb7o88TwfEp53H03/xLsyq+EfzqcI8
	 cYxDOaPGj1O1w==
Date: Sun, 31 Dec 2023 13:57:20 -0800
Subject: [PATCH 5/9] xfs: extend writeback requests to handle rt cow correctly
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404852751.1767395.2816430586609993508.stgit@frogsfrogsfrogs>
In-Reply-To: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs>
References: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

If we have shared realtime files and the rt extent size is larger than a
single fs block, we need to extend writeback requests to be aligned to
rt extent size granularity because we cannot share partial rt extents.
The front end should have set us up for this by dirtying the relevant
ranges.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_aops.c  |   38 ++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_trace.h |    1 +
 2 files changed, 39 insertions(+)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 1217ce197ad98..b6ef76ee65f5e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -495,6 +495,38 @@ static const struct iomap_writeback_ops xfs_writeback_ops = {
 	.discard_folio		= xfs_discard_folio,
 };
 
+/*
+ * Extend the writeback range to allocation unit granularity and alignment.
+ * This is a requirement for blocksize > pagesize scenarios such as realtime
+ * copy on write, since we can only share full rt extents.
+ */
+static inline void
+xfs_vm_writepages_extend(
+	struct xfs_inode		*ip,
+	struct writeback_control	*wbc)
+{
+	unsigned int			bsize = xfs_inode_alloc_unitsize(ip);
+	long long int			pages_to_write;
+	loff_t				next = wbc->range_end + 1;
+
+	wbc->range_start = rounddown_64(wbc->range_start, bsize);
+	if (wbc->range_end != LLONG_MAX)
+		wbc->range_end = roundup_64(next, bsize) - 1;
+
+	if (wbc->nr_to_write != LONG_MAX) {
+		pgoff_t		pg_start = wbc->range_start >> PAGE_SHIFT;
+		pgoff_t		pg_next = (wbc->range_end + 1) >> PAGE_SHIFT;
+
+		pages_to_write = pg_next - pg_start;
+		if (pages_to_write >= LONG_MAX)
+			pages_to_write = LONG_MAX;
+		if (wbc->nr_to_write < pages_to_write)
+			wbc->nr_to_write = pages_to_write;
+	}
+
+	trace_xfs_vm_writepages_extend(ip, wbc);
+}
+
 STATIC int
 xfs_vm_writepages(
 	struct address_space		*mapping,
@@ -512,6 +544,9 @@ xfs_vm_writepages(
 
 	trace_xfs_vm_writepages(ip, wbc);
 
+	if (xfs_inode_needs_cow_around(ip))
+		xfs_vm_writepages_extend(ip, wbc);
+
 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
 	return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
 }
@@ -525,6 +560,9 @@ xfs_dax_writepages(
 
 	trace_xfs_dax_writepages(ip, wbc);
 
+	if (xfs_inode_needs_cow_around(ip))
+		xfs_vm_writepages_extend(ip, wbc);
+
 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
 	return dax_writeback_mapping_range(mapping,
 			xfs_inode_buftarg(ip)->bt_daxdev, wbc);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 39df20ae702c8..4767fc49c4641 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1584,6 +1584,7 @@ DEFINE_EVENT(xfs_writeback_class, name,	\
 	TP_PROTO(struct xfs_inode *ip, const struct writeback_control *wbc), \
 	TP_ARGS(ip, wbc))
 DEFINE_WRITEBACK_EVENT(xfs_vm_writepages);
+DEFINE_WRITEBACK_EVENT(xfs_vm_writepages_extend);
 DEFINE_WRITEBACK_EVENT(xfs_dax_writepages);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,

From patchwork Sun Dec 31 21:57:36 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507763
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 25A5D11704
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 21:57:37 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="gY6nJU5b"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id E5A14C433C8;
	Sun, 31 Dec 2023 21:57:36 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704059857;
	bh=5Rr9JlF3EjsLQbhtb33DDUNhRGF5Q7iAcUYO9H9g3ow=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=gY6nJU5bfxdx59/YKspvRGazZ60M1OBhd0NzahSgXRLyZwV1awPesqIdwpjXyAM0O
	 RIpVZfqsxRTDiaqrFy1FwV0C+unzkyFfqQBfijy730Yq4LmhDpOjmfPOcTa7fT8Vyn
	 S2592wVMcbU+Ela4c9FTfw/O+V1wKsQbwdGWCjX+ZrKlpXcY0pTuMIi73nC+5Wlk/v
	 lG0/HULmIUu+OUrcXqbqHMKIHYci75rAWyWUOPLHLp+fvEmoxlDAEuof3v6NxSqE7O
	 17Jf9y0kzC4BeN8utcFvK78Yh9+Z7cYiQi1wzqlUAdbAhTFZtK4IG0Fsk/6tc5Bfhi
	 /pXu1BeCrfqkw==
Date: Sun, 31 Dec 2023 13:57:36 -0800
Subject: [PATCH 6/9] xfs: enable extent size hints for CoW when rtextsize > 1
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404852767.1767395.6394408204682530311.stgit@frogsfrogsfrogs>
In-Reply-To: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs>
References: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

CoW extent size hints are not allowed on filesystems that have large
realtime extents because we only want to perform the minimum required
amount of write-around (aka write amplification) for shared extents.

On filesystems where rtextsize > 1, allocations can only be done in
units of full rt extents, which means that we can only map an entire rt
extent's worth of blocks into the data fork.  Hole punch requests become
conversions to unwritten if the request isn't aligned properly.

Because a copy-write fundamentally requires remapping, this means that
we also can only do copy-writes of a full rt extent.  This is too
expensive for large hint sizes, since it's all or nothing.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_bmap.c |   22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 41354bdbbc90f..c6ab6a38bc7cd 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -6408,6 +6408,28 @@ xfs_get_cowextsz_hint(
 	if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
 		a = ip->i_cowextsize;
 	if (XFS_IS_REALTIME_INODE(ip)) {
+		/*
+		 * For realtime files, the realtime extent is the fundamental
+		 * unit of allocation.  This means that data sharing and CoW
+		 * remapping can only be done in those units.  For filesystems
+		 * where the extent size is larger than one block, write
+		 * requests that are not aligned to an extent boundary employ
+		 * an unshare-around strategy to ensure that all pages for a
+		 * shared extent are fully dirtied.
+		 *
+		 * Because the remapping alignment requirement applies equally
+		 * to all CoW writes, any regular overwrites that could be
+		 * turned (by a speculative CoW preallocation) into a CoW write
+		 * must either employ this dirty-around strategy, or be smart
+		 * enough to ignore the CoW fork mapping unless the entire
+		 * extent is dirty or becomes shared by writeback time.  Doing
+		 * the first would dramatically increase write amplification,
+		 * and the second would require deeper insight into the state
+		 * of the page cache during a writeback request.  For now, we
+		 * ignore the hint.
+		 */
+		if (ip->i_mount->m_sb.sb_rextsize > 1)
+			return ip->i_mount->m_sb.sb_rextsize;
 		b = 0;
 		if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
 			b = ip->i_extsize;

From patchwork Sun Dec 31 21:57:52 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507764
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id D1A6C11719
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 21:57:52 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="KDxIVo/E"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id A2D31C433C7;
	Sun, 31 Dec 2023 21:57:52 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704059872;
	bh=JTrp0euvZhFwTlDQTknBtC435Kc/a8OpWqmBPFVOONI=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=KDxIVo/E+KTDgjgIb8hBrnaB/iKPoxQXMD47zrxLLga7CGJJqtIzMgkpCyJgd9NwI
	 pLl2UWxBr9xz2Tif663r/ZLtR4yBsVB1NXqxpLdEvpDj3kk7zhVFWZIwcsr93o8R6p
	 lKZ5DR5J4sn24teer/gmmGhF5NrEdEXm+1Ds1ILkJz+GOaYmnete9nAfhAO1Q7957c
	 6xcJGybLQoKizU75isttunlivjsAohZyQjkfXWPKCVV43Xtyu3tdnBLVr/cQmn1FKr
	 Xa8+vs+8PwpBCQzKWmNb9Ypm/BC+us8g6ucjbH21N3VeIX4lKFywcPg/oNEQGKDqGY
	 6HWcv7oJBGZwQ==
Date: Sun, 31 Dec 2023 13:57:52 -0800
Subject: [PATCH 7/9] xfs: allow reflink on the rt volume when extent size is
 larger than 1 rt block
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404852783.1767395.4576654556100736347.stgit@frogsfrogsfrogs>
In-Reply-To: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs>
References: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Make the necessary tweaks to the reflink remapping code to support
remapping on the realtime volume when the rt extent size is larger than
a single rt block.  We need to check that the remap arguments from
userspace are aligned to a rt extent boundary, and that the length
is always aligned, even if the kernel tried to round it up to EOF for
us.  XFS can only map and remap full rt extents, so we have to be a
little more strict about the alignment there.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_reflink.c |   81 ++++++++++++++++++++++++++++++++++++++++++++++----
 fs/xfs/xfs_rtalloc.c |    2 +
 fs/xfs/xfs_super.c   |   19 +++++++++---
 fs/xfs/xfs_trace.h   |    3 ++
 4 files changed, 93 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 5d68603506f27..d516f3a35df36 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1534,6 +1534,13 @@ xfs_reflink_remap_blocks(
 	len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
 			XFS_MAX_FILEOFF);
 
+	/*
+	 * Make sure the end is aligned with an allocation unit, even if it's
+	 * past EOF.
+	 */
+	if (xfs_inode_has_bigallocunit(dest))
+		len = xfs_rtb_roundup_rtx(mp, len);
+
 	trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);
 
 	while (len > 0) {
@@ -1607,6 +1614,57 @@ xfs_reflink_zero_posteof(
 	return xfs_zero_range(ip, isize, pos - isize, NULL);
 }
 
+#ifdef CONFIG_XFS_RT
+/*
+ * Adjust the length of the remap operation to end on an allocation unit (AU)
+ * boundary.
+ */
+STATIC int
+xfs_reflink_adjust_bigalloc_len(
+	struct xfs_inode	*src,
+	loff_t			pos_in,
+	struct xfs_inode	*dest,
+	loff_t			pos_out,
+	loff_t			*len,
+	unsigned int		remap_flags)
+{
+	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(src);
+	uint32_t		mod;
+
+	div_u64_rem(*len, alloc_unit, &mod);
+
+	/*
+	 * We previously checked the AU alignment of both offsets, so we now
+	 * have to check the AU alignment of the length.  The VFS remap prep
+	 * function can change the length on us, so we can only make length
+	 * adjustments after that.  If the length is aligned to an AU, we're
+	 * good to go.
+	 *
+	 * Otherwise, the length is not aligned to an AU.  If the source file's
+	 * range ends at EOF, the VFS ensured that the dest file's range also
+	 * ends at EOF.  The actual remap function will round the (byte) length
+	 * up to the nearest AU, so we're ok here too.
+	 */
+	if (mod == 0 || pos_in + *len == i_size_read(VFS_I(src)))
+		return 0;
+
+	/*
+	 * Otherwise, the only thing we can do is round the request length down
+	 * to an AU boundary.  If the caller doesn't allow that, we cannot move
+	 * forward.
+	 */
+	if (!(remap_flags & REMAP_FILE_CAN_SHORTEN))
+		return -EINVAL;
+
+	/* Back off by a single extent. */
+	(*len) -= mod;
+	trace_xfs_reflink_adjust_bigalloc_len(src, pos_in, *len, dest, pos_out);
+	return 0;
+}
+#else
+# define xfs_reflink_adjust_bigalloc_len(...)		(0)
+#endif /* CONFIG_XFS_RT */
+
 /*
  * Prepare two files for range cloning.  Upon a successful return both inodes
  * will have the iolock and mmaplock held, the page cache of the out file will
@@ -1649,6 +1707,7 @@ xfs_reflink_remap_prep(
 	struct xfs_inode	*src = XFS_I(inode_in);
 	struct inode		*inode_out = file_inode(file_out);
 	struct xfs_inode	*dest = XFS_I(inode_out);
+	const struct iomap_ops	*dax_read_ops = NULL;
 	int			ret;
 
 	/* Lock both files against IO */
@@ -1666,15 +1725,25 @@ xfs_reflink_remap_prep(
 	if (IS_DAX(inode_in) != IS_DAX(inode_out))
 		goto out_unlock;
 
-	if (!IS_DAX(inode_in))
-		ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
-				pos_out, len, remap_flags);
-	else
-		ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
-				pos_out, len, remap_flags, &xfs_read_iomap_ops);
+	ASSERT(is_power_of_2(xfs_inode_alloc_unitsize(dest)));
+
+	if (IS_DAX(inode_in))
+		dax_read_ops = &xfs_read_iomap_ops;
+
+	ret = __generic_remap_file_range_prep(file_in, pos_in, file_out,
+			pos_out, len, remap_flags, dax_read_ops,
+			xfs_inode_alloc_unitsize(dest));
 	if (ret || *len == 0)
 		goto out_unlock;
 
+	/* Adjust the end to align to an allocation unit. */
+	if (xfs_inode_has_bigallocunit(src)) {
+		ret = xfs_reflink_adjust_bigalloc_len(src, pos_in, dest,
+				pos_out, len, remap_flags);
+		if (ret || *len == 0)
+			goto out_unlock;
+	}
+
 	/* Attach dquots to dest inode before changing block map */
 	ret = xfs_qm_dqattach(dest);
 	if (ret)
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 11b6645a5a534..c617c326125b3 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1253,7 +1253,7 @@ xfs_growfs_rt(
 		return -EOPNOTSUPP;
 	if (xfs_has_quota(mp))
 		return -EOPNOTSUPP;
-	if (xfs_has_reflink(mp) && in->extsize != 1)
+	if (xfs_has_reflink(mp) && !is_power_of_2(mp->m_sb.sb_rextsize))
 		return -EOPNOTSUPP;
 
 	nrblocks = in->newblocks;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bf0c0ce9a54b9..c17e1d06820d1 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1729,13 +1729,22 @@ xfs_fs_fill_super(
 
 	if (xfs_has_reflink(mp)) {
 		/*
-		 * Reflink doesn't support rt extent sizes larger than a single
-		 * block because we would have to perform unshare-around for
-		 * rtext-unaligned write requests.
+		 * Reflink doesn't support pagecache pages that span multiple
+		 * realtime extents because iomap doesn't track subpage dirty
+		 * state.  This means that we cannot dirty all the pages
+		 * backing an rt extent without dirtying the adjoining rt
+		 * extents.  If those rt extents are shared and extend into
+		 * other pages, this leads to crazy write amplification.  The
+		 * VFS remap_range checks assume power-of-two block sizes.
+		 *
+		 * Hence we only support rt extent sizes that are an integer
+		 * power of two because we know those will align with the page
+		 * size.
 		 */
-		if (xfs_has_realtime(mp) && mp->m_sb.sb_rextsize != 1) {
+		if (xfs_has_realtime(mp) &&
+		    !is_power_of_2(mp->m_sb.sb_rextsize)) {
 			xfs_alert(mp,
-	"reflink not compatible with realtime extent size %u!",
+	"reflink not compatible with non-power-of-2 realtime extent size %u!",
 					mp->m_sb.sb_rextsize);
 			error = -EINVAL;
 			goto out_filestream_unmount;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 4767fc49c4641..906e35eef223d 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3887,6 +3887,9 @@ TRACE_EVENT(xfs_reflink_remap_blocks,
 		  __entry->dest_lblk)
 );
 DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range);
+#ifdef CONFIG_XFS_RT
+DEFINE_DOUBLE_IO_EVENT(xfs_reflink_adjust_bigalloc_len);
+#endif /* CONFIG_XFS_RT */
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error);

From patchwork Sun Dec 31 21:58:07 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507765
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id D52A211704
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 21:58:08 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="RyUSv/e/"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 49786C433C7;
	Sun, 31 Dec 2023 21:58:08 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704059888;
	bh=q8OdXz0+so6B6GeDCKtdbJYvElax7dazHtawKdCqHuw=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=RyUSv/e/zopxH1cch1RoJF6V5Ba5b2Wd+inSKAti8leU1bgKDdHHGct+l6x/uNjtr
	 ynNSclKQ2/Jsi4tvB7it46Ybw/1zfefaFNNl1IifHGeeoWcSZtRJif0wl21O8mNK4H
	 Lrsnkq1tJyMiOW679j83e5eVPPa7EGtcTAwYq/jGfIE/t+3zRj4JPj0dlgYm57Czei
	 TZvC6Kskn0jCnltQx/zzOXAtJLOiUcYfADunXt/oQ+1ea0H+O0yC1rJ3enG1zUQwqR
	 8xoOvICsnbjZRso4N/lbA8SNV2NVPwoA7CyFZSXE0kdgnDTDkmKcYI14/LuTnFxxPC
	 2C50fEZi2uVfA==
Date: Sun, 31 Dec 2023 13:58:07 -0800
Subject: [PATCH 8/9] xfs: fix integer overflow when validating extent size
 hints
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404852799.1767395.13164066067840149369.stgit@frogsfrogsfrogs>
In-Reply-To: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs>
References: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Both file extent size hints are stored as 32-bit quantities, in units of
filesystem blocks.  As part of validating the hints, we convert these
quantities to bytes to ensure that the hint is congruent with the file's
allocation size.

The maximum possible hint value is 2097151 (aka XFS_MAX_BMBT_EXTLEN).
If the file allocation unit is larger than 2048, the unit conversion
will exceed 32 bits in size, which overflows the uint32_t used to store
the value used in the comparison.  This isn't a problem for files on the
data device since the hint will always be a multiple of the block size.
However, this is a problem for realtime files because the rtextent size
can be any integer number of fs blocks, and truncation of upper bits
changes the outcome of division.

Eliminate the overflow by performing the congruency check in units of
blocks, not bytes.  Otherwise, we get errors like this:

$ truncate -s 500T /tmp/a
$ mkfs.xfs -f -N /tmp/a -d extszinherit=2097151,rtinherit=1 -r extsize=28k
illegal extent size hint 2097151, must be less than 2097151 and a multiple of 7.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_inode_buf.c |   20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 81a12ca8ec434..adc457da52ef0 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -770,13 +770,11 @@ xfs_inode_validate_extsize(
 	bool				rt_flag;
 	bool				hint_flag;
 	bool				inherit_flag;
-	uint32_t			extsize_bytes;
-	uint32_t			blocksize_bytes;
+	uint32_t			alloc_unit = 1;
 
 	rt_flag = (flags & XFS_DIFLAG_REALTIME);
 	hint_flag = (flags & XFS_DIFLAG_EXTSIZE);
 	inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT);
-	extsize_bytes = XFS_FSB_TO_B(mp, extsize);
 
 	/*
 	 * This comment describes a historic gap in this verifier function.
@@ -805,9 +803,7 @@ xfs_inode_validate_extsize(
 	 */
 
 	if (rt_flag)
-		blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
-	else
-		blocksize_bytes = mp->m_sb.sb_blocksize;
+		alloc_unit = mp->m_sb.sb_rextsize;
 
 	if ((hint_flag || inherit_flag) && !(S_ISDIR(mode) || S_ISREG(mode)))
 		return __this_address;
@@ -825,7 +821,7 @@ xfs_inode_validate_extsize(
 	if (mode && !(hint_flag || inherit_flag) && extsize != 0)
 		return __this_address;
 
-	if (extsize_bytes % blocksize_bytes)
+	if (extsize % alloc_unit)
 		return __this_address;
 
 	if (extsize > XFS_MAX_BMBT_EXTLEN)
@@ -860,12 +856,10 @@ xfs_inode_validate_cowextsize(
 {
 	bool				rt_flag;
 	bool				hint_flag;
-	uint32_t			cowextsize_bytes;
-	uint32_t			blocksize_bytes;
+	uint32_t			alloc_unit = 1;
 
 	rt_flag = (flags & XFS_DIFLAG_REALTIME);
 	hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE);
-	cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize);
 
 	/*
 	 * Similar to extent size hints, a directory can be configured to
@@ -880,9 +874,7 @@ xfs_inode_validate_cowextsize(
 	 */
 
 	if (rt_flag)
-		blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
-	else
-		blocksize_bytes = mp->m_sb.sb_blocksize;
+		alloc_unit = mp->m_sb.sb_rextsize;
 
 	if (hint_flag && !xfs_has_reflink(mp))
 		return __this_address;
@@ -897,7 +889,7 @@ xfs_inode_validate_cowextsize(
 	if (mode && !hint_flag && cowextsize != 0)
 		return __this_address;
 
-	if (cowextsize_bytes % blocksize_bytes)
+	if (cowextsize % alloc_unit)
 		return __this_address;
 
 	if (cowextsize > XFS_MAX_BMBT_EXTLEN)

From patchwork Sun Dec 31 21:58:23 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Darrick J. Wong" <djwong@kernel.org>
X-Patchwork-Id: 13507766
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 19359BA43
	for <linux-xfs@vger.kernel.org>; Sun, 31 Dec 2023 21:58:24 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="OM/+kaHs"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id E006EC433C7;
	Sun, 31 Dec 2023 21:58:23 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1704059903;
	bh=I/yVVacjB0X38rOipjRkfFkp0EjKgmJTpfQfV4GkDgg=;
	h=Date:Subject:From:To:Cc:In-Reply-To:References:From;
	b=OM/+kaHsUaNjkODcQsOwwIoBB3XBBCIehtCYKLIP/Aux/0JiaXcsltHx5xyK4Xvom
	 3Kx3bsEi1+wdgzvKiTeoyVEzIuas9TQ2Be9Ql2Y4a4KxLIigWU0M09bYumdkXhQsqE
	 7XsMKzosGplhahQaUueL5HkMbmr1tZWE3cKkH4OEin1cxfYGdHMhqNJUYt5pUnGWqX
	 XD8fZHICZiZmbfRYblZEkMxOjEnzZLcCYGHN6GA78ikMAdxy2G380hr3TC5sw+uMs7
	 1Beim+XwMvBkc++jEFlFQeXTW39Pz1VNmPNyzDtqibylvI88nPiYMAFOTyiUUsC0uV
	 Qte6DHy/czu/Q==
Date: Sun, 31 Dec 2023 13:58:23 -0800
Subject: [PATCH 9/9] xfs: support realtime reflink with an extent size that
 isn't a power of 2
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org
Cc: linux-xfs@vger.kernel.org
Message-ID: <170404852815.1767395.842469542235107122.stgit@frogsfrogsfrogs>
In-Reply-To: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs>
References: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs>
User-Agent: StGit/0.19
Precedence: bulk
X-Mailing-List: linux-xfs@vger.kernel.org
List-Id: <linux-xfs.vger.kernel.org>
List-Subscribe: <mailto:linux-xfs+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-xfs+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Darrick J. Wong <djwong@kernel.org>

Add the necessary alignment checking code to the reflink remap code to
ensure that remap requests are aligned to rt extent boundaries if the
realtime extent size isn't a power of two.  The VFS helpers assume that
they can use the usual (blocksize - 1) masking to avoid slow 64-bit
division, but since XFS is special we won't make everyone pay that cost
for our weird edge case.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_reflink.c |   92 ++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/xfs/xfs_rtalloc.c |    3 +-
 fs/xfs/xfs_super.c   |   12 +++----
 3 files changed, 97 insertions(+), 10 deletions(-)

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index d516f3a35df36..0c54522404963 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1665,6 +1665,83 @@ xfs_reflink_adjust_bigalloc_len(
 # define xfs_reflink_adjust_bigalloc_len(...)		(0)
 #endif /* CONFIG_XFS_RT */
 
+/*
+ * Check the alignment of a remap request when the allocation unit size isn't a
+ * power of two.  The VFS helpers use (fast) bitmask-based alignment checks,
+ * but here we have to use slow long division.
+ */
+static int
+xfs_reflink_remap_check_rtalign(
+	struct xfs_inode		*ip_in,
+	loff_t				pos_in,
+	struct xfs_inode		*ip_out,
+	loff_t				pos_out,
+	loff_t				*req_len,
+	unsigned int			remap_flags)
+{
+	struct xfs_mount		*mp = ip_in->i_mount;
+	uint32_t			rextbytes;
+	loff_t				in_size, out_size;
+	loff_t				new_length, length = *req_len;
+	loff_t				blen;
+
+	rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
+	in_size = i_size_read(VFS_I(ip_in));
+	out_size = i_size_read(VFS_I(ip_out));
+
+	/* The start of both ranges must be aligned to a rt extent. */
+	if (!isaligned_64(pos_in, rextbytes) ||
+	    !isaligned_64(pos_out, rextbytes))
+		return -EINVAL;
+
+	if (length == 0)
+		length = in_size - pos_in;
+
+	/*
+	 * If the user wanted us to exchange up to the infile's EOF, round up
+	 * to the next block boundary for this check.
+	 *
+	 * Otherwise, reject the range length if it's not extent aligned.  We
+	 * already confirmed the starting offsets' extent alignment.
+	 */
+	if (pos_in + length == in_size)
+		blen = roundup_64(in_size, rextbytes) - pos_in;
+	else
+		blen = rounddown_64(length, rextbytes);
+
+	/* Don't allow overlapped remappings within the same file. */
+	if (ip_in == ip_out &&
+	    pos_out + blen > pos_in &&
+	    pos_in + blen > pos_out)
+		return -EINVAL;
+
+	/*
+	 * Ensure that we don't exchange a partial EOF extent into the middle
+	 * of another file.
+	 */
+	if (isaligned_64(length, rextbytes))
+		return 0;
+
+	new_length = length;
+	if (pos_out + length < out_size)
+		new_length = rounddown_64(new_length, rextbytes);
+
+	if (new_length == length)
+		return 0;
+
+	/*
+	 * Return the shortened request if the caller permits it.  If the
+	 * request was shortened to zero rt extents, we know that the original
+	 * arguments weren't valid in the first place.
+	 */
+	if ((remap_flags & REMAP_FILE_CAN_SHORTEN) && new_length > 0) {
+		*req_len = new_length;
+		return 0;
+	}
+
+	return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
+}
+
 /*
  * Prepare two files for range cloning.  Upon a successful return both inodes
  * will have the iolock and mmaplock held, the page cache of the out file will
@@ -1708,6 +1785,7 @@ xfs_reflink_remap_prep(
 	struct inode		*inode_out = file_inode(file_out);
 	struct xfs_inode	*dest = XFS_I(inode_out);
 	const struct iomap_ops	*dax_read_ops = NULL;
+	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(dest);
 	int			ret;
 
 	/* Lock both files against IO */
@@ -1725,14 +1803,22 @@ xfs_reflink_remap_prep(
 	if (IS_DAX(inode_in) != IS_DAX(inode_out))
 		goto out_unlock;
 
-	ASSERT(is_power_of_2(xfs_inode_alloc_unitsize(dest)));
+	/* Check non-power of two alignment issues, if necessary. */
+	if (XFS_IS_REALTIME_INODE(dest) && !is_power_of_2(alloc_unit)) {
+		ret = xfs_reflink_remap_check_rtalign(src, pos_in, dest,
+				pos_out, len, remap_flags);
+		if (ret)
+			goto out_unlock;
+
+		/* Do the VFS checks with the regular block alignment. */
+		alloc_unit = src->i_mount->m_sb.sb_blocksize;
+	}
 
 	if (IS_DAX(inode_in))
 		dax_read_ops = &xfs_read_iomap_ops;
 
 	ret = __generic_remap_file_range_prep(file_in, pos_in, file_out,
-			pos_out, len, remap_flags, dax_read_ops,
-			xfs_inode_alloc_unitsize(dest));
+			pos_out, len, remap_flags, dax_read_ops, alloc_unit);
 	if (ret || *len == 0)
 		goto out_unlock;
 
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index c617c326125b3..7917eaef911f6 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1253,7 +1253,8 @@ xfs_growfs_rt(
 		return -EOPNOTSUPP;
 	if (xfs_has_quota(mp))
 		return -EOPNOTSUPP;
-	if (xfs_has_reflink(mp) && !is_power_of_2(mp->m_sb.sb_rextsize))
+	if (xfs_has_reflink(mp) && !is_power_of_2(mp->m_sb.sb_rextsize) &&
+	    (XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) & ~PAGE_MASK))
 		return -EOPNOTSUPP;
 
 	nrblocks = in->newblocks;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c17e1d06820d1..b5291b0ea21d9 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1734,17 +1734,17 @@ xfs_fs_fill_super(
 		 * state.  This means that we cannot dirty all the pages
 		 * backing an rt extent without dirtying the adjoining rt
 		 * extents.  If those rt extents are shared and extend into
-		 * other pages, this leads to crazy write amplification.  The
-		 * VFS remap_range checks assume power-of-two block sizes.
+		 * other pages, this leads to crazy write amplification.
 		 *
 		 * Hence we only support rt extent sizes that are an integer
-		 * power of two because we know those will align with the page
-		 * size.
+		 * power of two or an integer multiple of the page size because
+		 * we know those will align with the page size.
 		 */
 		if (xfs_has_realtime(mp) &&
-		    !is_power_of_2(mp->m_sb.sb_rextsize)) {
+		    !is_power_of_2(mp->m_sb.sb_rextsize) &&
+		    (XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) & ~PAGE_MASK)) {
 			xfs_alert(mp,
-	"reflink not compatible with non-power-of-2 realtime extent size %u!",
+	"reflink not compatible with realtime extent size %u!",
 					mp->m_sb.sb_rextsize);
 			error = -EINVAL;
 			goto out_filestream_unmount;