From patchwork Sun Dec 31 21:56:18 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13507758 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 34DE811189 for ; Sun, 31 Dec 2023 21:56:18 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="sQ2RzmTU" Received: by smtp.kernel.org (Postfix) with ESMTPSA id ADB58C433CA; Sun, 31 Dec 2023 21:56:18 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704059778; bh=yFeOMuN68qYYPMEpSZyDLwjbySyE1VG7tN59Fa7ykSk=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=sQ2RzmTU7uPN9RxSavxOu7aeccWCK4KQ37gLumzjDqBlAugAzQOi28EpD2R84kb3f /J0elOjULm6a3TMW82w5f3qo/advE4A9ly9e+yrKS7gchkSduuZWCIz3jPLllYiRRN 1Xf7iJ+ERnN6R5/lk8SjsaFx5f2v96ZCyYqDrnR9HadjXr+eGDD0gBxlIi9XQhAJHB 0f95saHjIA+DZx+BGU6RsWfaoyp3gAgyNYaWVc7H47WyP84k+Tz7f0GfXBXTbYzrHD 15OFVNvTL7llUvpNWgUegn6f99sdMCI6St5M/0WmP4+k9HflD/cM++55f87BuTUlPR lQByHqqlNe2JQ== Date: Sun, 31 Dec 2023 13:56:18 -0800 Subject: [PATCH 1/9] vfs: explicitly pass the block size to the remap prep function From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170404852686.1767395.2701297777524398388.stgit@frogsfrogsfrogs> In-Reply-To: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs> References: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Make it so that filesystems can pass an explicit blocksize to the remap prep function. 
This enables filesystems whose fundamental allocation units are /not/ the same as the blocksize to ensure that the remapping checks are aligned properly. Signed-off-by: Darrick J. Wong --- fs/dax.c | 5 ++++- fs/remap_range.c | 30 ++++++++++++++++++------------ include/linux/fs.h | 3 ++- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 3380b43cb6bbb..06b631eafb46c 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -2061,7 +2061,10 @@ int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, loff_t *len, unsigned int remap_flags, const struct iomap_ops *ops) { + unsigned int blocksize = file_inode(file_out)->i_sb->s_blocksize; + return __generic_remap_file_range_prep(file_in, pos_in, file_out, - pos_out, len, remap_flags, ops); + pos_out, len, remap_flags, ops, + blocksize); } EXPORT_SYMBOL_GPL(dax_remap_file_range_prep); diff --git a/fs/remap_range.c b/fs/remap_range.c index f75ff15c94976..5d5b802f56086 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -30,18 +30,18 @@ */ static int generic_remap_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - loff_t *req_count, unsigned int remap_flags) + loff_t *req_count, unsigned int remap_flags, + unsigned int blocksize) { struct inode *inode_in = file_in->f_mapping->host; struct inode *inode_out = file_out->f_mapping->host; uint64_t count = *req_count; uint64_t bcount; loff_t size_in, size_out; - loff_t bs = inode_out->i_sb->s_blocksize; int ret; /* The start of both ranges must be aligned to an fs block. */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) + if (!IS_ALIGNED(pos_in, blocksize) || !IS_ALIGNED(pos_out, blocksize)) return -EINVAL; /* Ensure offsets don't wrap. 
*/ @@ -75,10 +75,10 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in, */ if (pos_in + count == size_in && (!(remap_flags & REMAP_FILE_DEDUP) || pos_out + count == size_out)) { - bcount = ALIGN(size_in, bs) - pos_in; + bcount = ALIGN(size_in, blocksize) - pos_in; } else { - if (!IS_ALIGNED(count, bs)) - count = ALIGN_DOWN(count, bs); + if (!IS_ALIGNED(count, blocksize)) + count = ALIGN_DOWN(count, blocksize); bcount = count; } @@ -128,9 +128,10 @@ static int generic_remap_check_len(struct inode *inode_in, struct inode *inode_out, loff_t pos_out, loff_t *len, - unsigned int remap_flags) + unsigned int remap_flags, + unsigned int blocksize) { - u64 blkmask = i_blocksize(inode_in) - 1; + u64 blkmask = blocksize - 1; loff_t new_len = *len; if ((*len & blkmask) == 0) @@ -271,7 +272,8 @@ int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, - const struct iomap_ops *dax_read_ops) + const struct iomap_ops *dax_read_ops, + unsigned int blocksize) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -306,7 +308,7 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, /* Check that we don't violate system file offset limits. 
*/ ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - remap_flags); + remap_flags, blocksize); if (ret || *len == 0) return ret; @@ -347,7 +349,7 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, } ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, - remap_flags); + remap_flags, blocksize); if (ret || *len == 0) return ret; @@ -357,13 +359,17 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, return ret; } +EXPORT_SYMBOL(__generic_remap_file_range_prep); int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags) { + unsigned int blocksize = file_inode(file_out)->i_sb->s_blocksize; + return __generic_remap_file_range_prep(file_in, pos_in, file_out, - pos_out, len, remap_flags, NULL); + pos_out, len, remap_flags, NULL, + blocksize); } EXPORT_SYMBOL(generic_remap_file_range_prep); diff --git a/include/linux/fs.h b/include/linux/fs.h index d9badb63bfc29..aeead94a5b167 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2036,7 +2036,8 @@ int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write); int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, - const struct iomap_ops *dax_read_ops); + const struct iomap_ops *dax_read_ops, + unsigned int block_size); int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *count, unsigned int remap_flags); From patchwork Sun Dec 31 21:56:33 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13507759 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 97BFC1171B for ; Sun, 31 Dec 2023 21:56:34 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="Iqc0Zclb" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 6759BC433C7; Sun, 31 Dec 2023 21:56:34 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704059794; bh=cA56i82jDHrYXEthv9I4eCGmXAAuHI7VU4dLWfEg4vk=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=Iqc0Zclb/fST+lAUbJ3xUduVENHjJxs5fYPF7AnzVqo6QGekH4RA9XTJTR4Uf9Wo/ GKeyEEoZUy1kQkMem4QkfIb/dvQULVPf4RsyEZJJFoF8i+87BWc4ALKrKumagKaQF9 3cxcsmQOzqnROk52yBcVGtLyUKp6fxGwa2mX7pglscknEBvjGWiDqzJR3oyBUYVmWg UI/dRIVZ/JzqDHYx/SDL4tLbvJlSKXy4Qm19CCSZzh0e4jTIRRXq7tNP1+B5Kwk6Io l3qMLXB7RLsijYEygRMnvUK72qxbAahDtR31x0BInd51heY2yBteVZCLV3+MtkiaMI HklbvyQmYJyVA== Date: Sun, 31 Dec 2023 13:56:33 -0800 Subject: [PATCH 2/9] xfs: enable CoW when rt extent size is larger than 1 block From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170404852703.1767395.4031402007268981201.stgit@frogsfrogsfrogs> In-Reply-To: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs> References: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Copy on write encounters a major plot twist when the file being CoW'd lives on the realtime volume and the realtime extent size is larger than a single filesystem block. 
XFS can only unmap and remap full rt extents, which means that allocations are always done in units of full rt extents, and a request to unmap less than one extent is treated as a request to convert an extent to unwritten status. This behavioral quirk is not compatible with the existing CoW mechanism, so we have to intercept every path through which files can be modified to ensure that we dirty an entire rt extent at once so that we can remap a full rt extent. Use the existing VFS unshare functions to dirty the page cache to set that up. Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 224 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_file.h | 3 + fs/xfs/xfs_inode.h | 6 + fs/xfs/xfs_iops.c | 22 +++++ fs/xfs/xfs_reflink.c | 39 +++++++++ fs/xfs/xfs_trace.h | 1 6 files changed, 293 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index fdbeb6c3fbc44..ebdda286cb2a2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -358,6 +358,116 @@ xfs_file_splice_read( return ret; } +/* + * Decide if this file write requires COWing-around at either end of the write + * range. This is only required if the file allocation unit is larger than + * 1FSB and the write range is not aligned with the allocation unit. + */ +static bool +xfs_file_write_needs_cow_around( + struct xfs_inode *ip, + loff_t pos, + long long int count) +{ + /* + * No COWing required if this inode doesn't do COW. + * + * If the allocation unit is 1FSB, we do not need to COW around the + * edges of the operation range. This applies to all files on the data + * device and rt files that have an extent size of 1FSB. + */ + if (!xfs_inode_needs_cow_around(ip)) + return false; + + /* + * Otherwise, check that the operation is aligned to the rt extent + * size. Any unaligned operation /must/ be COWed around since the + * regular reflink code only handles extending writes up to fsblock + * boundaries. 
+ */ + return !xfs_is_falloc_aligned(ip, pos, count); +} + +/* Do we need to COW-around at this offset to handle a truncate up or down? */ +bool +xfs_truncate_needs_cow_around( + struct xfs_inode *ip, + loff_t pos) +{ + return xfs_file_write_needs_cow_around(ip, pos, 0); +} + +/* Does this file write require COWing around? */ +static inline bool +xfs_iocb_needs_cow_around( + struct xfs_inode *ip, + const struct kiocb *iocb, + const struct iov_iter *from) +{ + return xfs_file_write_needs_cow_around(ip, iocb->ki_pos, + iov_iter_count(from)); +} + +/* Unshare the allocation unit mapped to the given file position. */ +inline int +xfs_file_unshare_at( + struct xfs_inode *ip, + loff_t pos) +{ + loff_t isize = i_size_read(VFS_I(ip)); + unsigned int extsize, len; + uint32_t mod; + + len = extsize = xfs_inode_alloc_unitsize(ip); + + /* Open-coded rounddown_64 so that we can skip out if aligned */ + div_u64_rem(pos, extsize, &mod); + if (mod == 0) + return 0; + pos -= mod; + + /* Do not extend the file. */ + if (pos >= isize) + return 0; + if (pos + len > isize) + len = isize - pos; + + trace_xfs_file_cow_around(ip, pos, len); + + if (IS_DAX(VFS_I(ip))) + return dax_file_unshare(VFS_I(ip), pos, len, + &xfs_dax_write_iomap_ops); + return iomap_file_unshare(VFS_I(ip), pos, len, + &xfs_buffered_write_iomap_ops); +} + +/* + * Dirty the pages on either side of a write request as needed to satisfy + * alignment requirements if we're going to perform a copy-write. + * + * This is only needed for realtime files when the rt extent size is larger + * than 1 fs block, because we don't allow a logical rt extent in a file to map + * to multiple physical rt extents. In other words, we can only map and unmap + * full rt extents. Note that page cache doesn't exist above EOF, so be + * careful to stay below EOF. + */ +static int +xfs_file_cow_around( + struct xfs_inode *ip, + loff_t pos, + long long int count) +{ + int error; + + /* Unshare at the start of the extent. 
*/ + error = xfs_file_unshare_at(ip, pos); + if (error) + return error; + + /* Unshare at the end. */ + return xfs_file_unshare_at(ip, pos + count); +} + /* * Common pre-write limit and setup checks. * @@ -397,9 +507,10 @@ xfs_file_write_checks( /* * For changing security info in file_remove_privs() we need i_rwsem - * exclusively. + * exclusively. We also need it to COW around the range being written. */ - if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { + if (*iolock == XFS_IOLOCK_SHARED && + (!IS_NOSEC(inode) || xfs_iocb_needs_cow_around(ip, iocb, from))) { xfs_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; error = xfs_ilock_iocb(iocb, *iolock); @@ -410,6 +521,22 @@ xfs_file_write_checks( goto restart; } + /* + * The write is not aligned to the file's allocation unit. If either + * of the allocation units at the start or end of the write range are + * shared, unshare them through the page cache. + */ + if (xfs_iocb_needs_cow_around(ip, iocb, from)) { + ASSERT(*iolock == XFS_IOLOCK_EXCL); + + inode_dio_wait(VFS_I(ip)); + drained_dio = true; + + error = xfs_file_cow_around(ip, iocb->ki_pos, count); + if (error) + return error; + } + /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this @@ -461,6 +588,17 @@ xfs_file_write_checks( goto restart; } + /* + * If we're starting the write past EOF, COW the allocation + * unit containing the current EOF before we start zeroing the + * range between EOF and the start of the write. 
+ */ + if (xfs_truncate_needs_cow_around(ip, isize)) { + error = xfs_file_unshare_at(ip, isize); + if (error) + return error; + } + trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); if (error) @@ -575,6 +713,16 @@ xfs_file_dio_write_aligned( unsigned int iolock = XFS_IOLOCK_SHARED; ssize_t ret; + /* + * If the range to write is not aligned to an allocation unit, we will + * have to COW the allocation units on both ends of the write. Because + * this runs through the page cache, it requires IOLOCK_EXCL. This + * predicate performs an unlocked access of the rt and reflink inode + * state. + */ + if (xfs_iocb_needs_cow_around(ip, iocb, from)) + iolock = XFS_IOLOCK_EXCL; + ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; @@ -927,6 +1075,13 @@ xfs_file_fallocate( goto out_unlock; if (mode & FALLOC_FL_PUNCH_HOLE) { + /* Unshare around the region to punch, if needed. */ + if (xfs_file_write_needs_cow_around(ip, offset, len)) { + error = xfs_file_cow_around(ip, offset, len); + if (error) + goto out_unlock; + } + error = xfs_free_file_space(ip, offset, len); if (error) goto out_unlock; @@ -997,6 +1152,13 @@ xfs_file_fallocate( trace_xfs_zero_file_space(ip); + /* Unshare around the region to zero, if needed. */ + if (xfs_file_write_needs_cow_around(ip, offset, len)) { + error = xfs_file_cow_around(ip, offset, len); + if (error) + goto out_unlock; + } + error = xfs_free_file_space(ip, offset, len); if (error) goto out_unlock; @@ -1005,6 +1167,26 @@ xfs_file_fallocate( round_down(offset, blksize); offset = round_down(offset, blksize); } else if (mode & FALLOC_FL_UNSHARE_RANGE) { + /* + * Enlarge the unshare region to align to a full + * allocation unit. 
+ */ + if (xfs_inode_needs_cow_around(ip)) { + loff_t isize = i_size_read(VFS_I(ip)); + unsigned int rextsize; + uint32_t mod; + + rextsize = xfs_inode_alloc_unitsize(ip); + div_u64_rem(offset, rextsize, &mod); + offset -= mod; + len += mod; + + div_u64_rem(offset + len, rextsize, &mod); + if (mod) + len += rextsize - mod; + if (offset + len > isize) + len = isize - offset; + } error = xfs_reflink_unshare(ip, offset, len); if (error) goto out_unlock; @@ -1272,6 +1454,34 @@ xfs_dax_fault( } #endif +static int +xfs_filemap_fault_around( + struct vm_fault *vmf, + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + struct folio *folio = page_folio(vmf->page); + loff_t pos; + ssize_t len; + + if (!xfs_inode_needs_cow_around(ip)) + return 0; + + folio_lock(folio); + len = folio_mkwrite_check_truncate(folio, inode); + if (len < 0) { + folio_unlock(folio); + return len; + } + pos = folio_pos(folio); + folio_unlock(folio); + + if (!xfs_file_write_needs_cow_around(ip, pos, len)) + return 0; + + return xfs_file_cow_around(XFS_I(inode), pos, len); +} + /* * Locking for serialisation of IO during page faults. This results in a lock * ordering of: @@ -1310,11 +1520,21 @@ __xfs_filemap_fault( if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, order, pfn); } else if (write_fault) { + /* + * Unshare all the blocks in this rt extent surrounding + * this page. 
+ */ + int error = xfs_filemap_fault_around(vmf, inode); + if (error) { + ret = vmf_fs_error(error); + goto out_unlock; + } ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops); } else { ret = filemap_fault(vmf); } +out_unlock: if (lock_mode) xfs_iunlock(XFS_I(inode), lock_mode); diff --git a/fs/xfs/xfs_file.h b/fs/xfs/xfs_file.h index 2ad91f755caf3..24490ea49e16c 100644 --- a/fs/xfs/xfs_file.h +++ b/fs/xfs/xfs_file.h @@ -12,4 +12,7 @@ extern const struct file_operations xfs_dir_file_operations; bool xfs_is_falloc_aligned(struct xfs_inode *ip, loff_t pos, long long int len); +bool xfs_truncate_needs_cow_around(struct xfs_inode *ip, loff_t pos); +int xfs_file_unshare_at(struct xfs_inode *ip, loff_t pos); + #endif /* __XFS_FILE_H__ */ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 6013a97d02c5d..df8197fe4cb82 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -331,6 +331,12 @@ static inline bool xfs_inode_has_bigallocunit(struct xfs_inode *ip) return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1; } +/* Decide if we need to unshare the blocks around a range that we're writing. */ +static inline bool xfs_inode_needs_cow_around(struct xfs_inode *ip) +{ + return xfs_is_cow_inode(ip) && xfs_inode_has_bigallocunit(ip); +} + /* * Return the buftarg used for data allocations on a given inode. */ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 461e77dd54e38..71a4398fd36ac 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -27,6 +27,7 @@ #include "xfs_xattr.h" #include "xfs_file.h" #include "xfs_bmap.h" +#include "xfs_reflink.h" #include #include @@ -877,10 +878,31 @@ xfs_setattr_size( * truncate. */ if (newsize > oldsize) { + /* + * Extending the file size, so COW around the allocation unit + * containing EOF before we zero the new range of the file. 
+ */ + if (xfs_truncate_needs_cow_around(ip, oldsize)) { + error = xfs_file_unshare_at(ip, oldsize); + if (error) + return error; + } + trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); error = xfs_zero_range(ip, oldsize, newsize - oldsize, &did_zeroing); } else { + /* + * Truncating the file, so COW around the new EOF allocation + * unit before truncation zeroes the part of the EOF block + * after the new EOF. + */ + if (xfs_truncate_needs_cow_around(ip, newsize)) { + error = xfs_file_unshare_at(ip, newsize); + if (error) + return error; + } + /* * iomap won't detect a dirty page over an unwritten block (or a * cow block over a hole) and subsequently skips zeroing the diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index b0f3170c11c8b..d5773f9b7ec54 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -34,6 +34,7 @@ #include "xfs_rtalloc.h" #include "xfs_rtgroup.h" #include "xfs_imeta.h" +#include "xfs_rtbitmap.h" /* * Copy on Write of Shared Blocks @@ -297,9 +298,26 @@ xfs_reflink_convert_cow_locked( struct xfs_iext_cursor icur; struct xfs_bmbt_irec got; struct xfs_btree_cur *dummy_cur = NULL; + struct xfs_mount *mp = ip->i_mount; int dummy_logflags; int error = 0; + /* + * We can only remap full rt extents, so make sure that we convert the + * entire extent. The caller must ensure that this is either a direct + * write that's aligned to the rt extent size, or a buffered write for + * which we've dirtied extra pages to make this work properly. 
+ */ + if (xfs_inode_needs_cow_around(ip)) { + xfs_fileoff_t new_off; + + new_off = xfs_rtb_rounddown_rtx(mp, offset_fsb); + count_fsb += offset_fsb - new_off; + offset_fsb = new_off; + + count_fsb = xfs_rtb_roundup_rtx(mp, count_fsb); + } + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) return 0; @@ -635,11 +653,21 @@ xfs_reflink_cancel_cow_blocks( bool cancel_real) { struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); + struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; bool isrt = XFS_IS_REALTIME_INODE(ip); int error = 0; + /* + * Shrink the range that we're cancelling if they don't align to the + * realtime extent size, since we can only free full extents. + */ + if (xfs_inode_needs_cow_around(ip)) { + offset_fsb = xfs_rtb_roundup_rtx(mp, offset_fsb); + end_fsb = xfs_rtb_rounddown_rtx(mp, end_fsb); + } + if (!xfs_inode_has_cow_data(ip)) return 0; if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) @@ -946,6 +974,7 @@ xfs_reflink_end_cow( xfs_off_t offset, xfs_off_t count) { + struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb; xfs_fileoff_t end_fsb; int error = 0; @@ -955,6 +984,16 @@ xfs_reflink_end_cow( offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); + /* + * Make sure the end is aligned with a rt extent (if desired), since + * the end of the range could be EOF. The _convert_cow function should + * have set us up to swap only full rt extents. + */ + if (xfs_inode_needs_cow_around(ip)) { + offset_fsb = xfs_rtb_rounddown_rtx(mp, offset_fsb); + end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb); + } + /* * Walk forwards until we've remapped the I/O range. 
The loop function * repeatedly cycles the ILOCK to allocate one transaction per remapped diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f94f144f9a39d..643cffaf3add2 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3892,6 +3892,7 @@ TRACE_EVENT(xfs_ioctl_clone, /* unshare tracepoints */ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare); +DEFINE_SIMPLE_IO_EVENT(xfs_file_cow_around); DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); #ifdef CONFIG_XFS_RT DEFINE_SIMPLE_IO_EVENT(xfs_convert_bigalloc_file_space); From patchwork Sun Dec 31 21:56:49 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13507760 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 362631171B for ; Sun, 31 Dec 2023 21:56:50 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="ITq7QEzW" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 08DC3C433C8; Sun, 31 Dec 2023 21:56:49 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704059810; bh=gZYA1weZfJ3HP/d7VEAjufvv33VpmlWVw4WZno1SNsY=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=ITq7QEzWGlIKHkZ6KCqDeAlyTeqhV8KYdmbJ4EV9mct6TYjLYb7i/UH7sKyUz4Dw1 KsNzANGa5pqlZyeVHg+7T+RjV+mxF3sI/jy5Vt6rqSC7Z6BBsVwtF7u6MJoetO8zv4 IzvgSn8u5h9NTsQYs6nvWXXwC3CDIFOEB5qQ1ZOqK2imgFXOuRsMifL6o/2Wbn2Pzf xF2HRcahtMW5IpRWQGQ12JNlkZwBwCFOr/iRSQSuGeSP7S+2sViytZ+tXPSAd9QfAu 5xySjwYOb8e9KGoLS4nxTHhhaqgi0+thIkfrytAjPjhW9oGZqBj/R4PK0m4UZqDO2R ffFZjSRlNNI2Q== Date: Sun, 31 Dec 2023 13:56:49 -0800 Subject: [PATCH 3/9] xfs: forcibly convert unwritten blocks within an rt extent before sharing From: "Darrick J. 
Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170404852719.1767395.5382072798264653253.stgit@frogsfrogsfrogs> In-Reply-To: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs> References: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong As noted in the previous patch, XFS can only unmap and map full rt extents. First, this means that we cannot stop mid-extent for any reason, including stepping around unwritten/written extents. Second, the reflink and CoW mechanisms were not designed to handle shared unwritten extents, so we have to do something to get rid of them. If the user asks us to remap two files, we must scan both ranges beforehand to convert any unwritten extents that are not aligned to rt extent boundaries into zeroed written extents before sharing. Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_reflink.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index d5773f9b7ec54..5d68603506f27 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1693,6 +1693,25 @@ xfs_reflink_remap_prep( if (ret) goto out_unlock; + /* + * Now that we've marked both inodes for reflink, make sure that all + * allocation units (AU) mapped into either files' ranges are either + * wholly written, wholly unwritten, or holes. The bmap code requires + * that we align all unmap and remap requests to an AU. We've already + * flushed the page cache and finished directio for the range that's + * being remapped, so we can convert the mappings directly. 
+ */ + if (xfs_inode_has_bigallocunit(src)) { + ret = xfs_convert_bigalloc_file_space(src, pos_in, *len); + if (ret) + goto out_unlock; + } + if (xfs_inode_has_bigallocunit(dest)) { + ret = xfs_convert_bigalloc_file_space(dest, pos_out, *len); + if (ret) + goto out_unlock; + } + /* * If pos_out > EOF, we may have dirtied blocks between EOF and * pos_out. In that case, we need to extend the flush and unmap to cover From patchwork Sun Dec 31 21:57:05 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13507761 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id DA76111704 for ; Sun, 31 Dec 2023 21:57:05 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="iQM578Bo" Received: by smtp.kernel.org (Postfix) with ESMTPSA id A9F1DC433C8; Sun, 31 Dec 2023 21:57:05 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704059825; bh=vg76k9acyABRI+YhTu1zMJQdFknTjOSlaq0rncKsg04=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=iQM578BoCxjmo6MCYdVbSW+bmBV4K+Y7uhk7SFl4i/ZcrshJdTz6aX2H7Cy973qPS mr+n5fdnftj0r+v0skDDpO29QChKnDt4hJXJvZ0imdk68FMQQLs/qNbYUtvFNgEAEf w0CyLCtkkJvckDGyWFCgtKO1ZfXiyHDWlHC7jhQiuxO/RIR9qzeTCeLbCr81w6RTXD U8H5iuWyBitB6PIaNPGpguj3E4gCL5dxR6PTEpUQKftFZ8DRXXO3yY7A0eQUnK4idI /fHIjKvLOK2XIsuQYPPmBuNr23s4Q9gl7TtKbM1tX6AYQrD5Gp5ywRJsSLuDsIFxUI y6lA3qH0/OR1A== Date: Sun, 31 Dec 2023 13:57:05 -0800 Subject: [PATCH 4/9] xfs: add some tracepoints for writeback From: "Darrick J. 
Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170404852735.1767395.8058627554300848243.stgit@frogsfrogsfrogs> In-Reply-To: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs> References: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Add tracepoints to the buffered and DAX writepages paths so I can see where writeback is initiated. Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 19 ++++++++++++------- fs/xfs/xfs_trace.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3001ddf48d6c6..1217ce197ad98 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -497,10 +497,11 @@ static const struct iomap_writeback_ops xfs_writeback_ops = { STATIC int xfs_vm_writepages( - struct address_space *mapping, - struct writeback_control *wbc) + struct address_space *mapping, + struct writeback_control *wbc) { - struct xfs_writepage_ctx wpc = { }; + struct xfs_writepage_ctx wpc = { }; + struct xfs_inode *ip = XFS_I(mapping->host); /* * Writing back data in a transaction context can result in recursive @@ -509,16 +510,20 @@ xfs_vm_writepages( if (WARN_ON_ONCE(current->journal_info)) return 0; - xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); + trace_xfs_vm_writepages(ip, wbc); + + xfs_iflags_clear(ip, XFS_ITRUNCATED); return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); } STATIC int xfs_dax_writepages( - struct address_space *mapping, - struct writeback_control *wbc) + struct address_space *mapping, + struct writeback_control *wbc) { - struct xfs_inode *ip = XFS_I(mapping->host); + struct xfs_inode *ip = XFS_I(mapping->host); + + trace_xfs_dax_writepages(ip, wbc); xfs_iflags_clear(ip, XFS_ITRUNCATED); return dax_writeback_mapping_range(mapping, diff --git a/fs/xfs/xfs_trace.h 
b/fs/xfs/xfs_trace.h index 643cffaf3add2..39df20ae702c8 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1552,6 +1552,40 @@ DEFINE_IMAP_EVENT(xfs_map_blocks_alloc); DEFINE_IMAP_EVENT(xfs_iomap_alloc); DEFINE_IMAP_EVENT(xfs_iomap_found); +DECLARE_EVENT_CLASS(xfs_writeback_class, + TP_PROTO(struct xfs_inode *ip, const struct writeback_control *wbc), + TP_ARGS(ip, wbc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(loff_t, range_start) + __field(loff_t, range_end) + __field(long, nr_to_write) + __field(enum writeback_sync_modes, sync_mode) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->range_start = wbc->range_start; + __entry->range_end = wbc->range_end; + __entry->nr_to_write = wbc->nr_to_write; + __entry->sync_mode = wbc->sync_mode; + ), + TP_printk("dev %d:%d ino 0x%llx range_start 0x%llx range_end 0x%llx nr_to_write %ld sync_mode %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->range_start, + __entry->range_end, + __entry->nr_to_write, + __entry->sync_mode) +); +#define DEFINE_WRITEBACK_EVENT(name) \ +DEFINE_EVENT(xfs_writeback_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct writeback_control *wbc), \ + TP_ARGS(ip, wbc)) +DEFINE_WRITEBACK_EVENT(xfs_vm_writepages); +DEFINE_WRITEBACK_EVENT(xfs_dax_writepages); + DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, u64 count), TP_ARGS(ip, offset, count), From patchwork Sun Dec 31 21:57:20 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13507762 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id D575711704 for ; Sun, 31 Dec 2023 21:57:21 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="KnzKGD/G" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 57E1DC433C8; Sun, 31 Dec 2023 21:57:21 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704059841; bh=2p5a3yxnr9HTGuBjAk30UqelWBu9sZ+ILLpyRkjs1Js=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=KnzKGD/Gm5ptuisy7GWtDlhKky4IL2fEwePzl9wNiyfW6DrXtRbl1shjTIDxBGEti vRHyI60kuMk2e14bTCWUIzdNPGtJqMj0i0aL1KznJPzfSM2YnbLnl4ovFNzsk1TwXN cfkLoqT03wdQ62coxEI/Z0CG9O3YoFfay1BN8plVSEyUlvUIqLnpuscWAlG6Makdgl 6v+tugt3xNOReocRDXB6zMoFVlhGS9lkdeGgFxW/OmIW46GxTUYaWqiW631gb8g9ZJ 9gCbZsNW3nYpGHADi27ZXPo5kuEXS4b8eqLwnb7o88TwfEp53H03/xLsyq+EfzqcI8 cYxDOaPGj1O1w== Date: Sun, 31 Dec 2023 13:57:20 -0800 Subject: [PATCH 5/9] xfs: extend writeback requests to handle rt cow correctly From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170404852751.1767395.2816430586609993508.stgit@frogsfrogsfrogs> In-Reply-To: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs> References: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong If we have shared realtime files and the rt extent size is larger than a single fs block, we need to extend writeback requests to be aligned to rt extent size granularity because we cannot share partial rt extents. 
The front end should have set us up for this by dirtying the relevant ranges. Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_trace.h | 1 + 2 files changed, 39 insertions(+) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 1217ce197ad98..b6ef76ee65f5e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -495,6 +495,38 @@ static const struct iomap_writeback_ops xfs_writeback_ops = { .discard_folio = xfs_discard_folio, }; +/* + * Extend the writeback range to allocation unit granularity and alignment. + * This is a requirement for blocksize > pagesize scenarios such as realtime + * copy on write, since we can only share full rt extents. + */ +static inline void +xfs_vm_writepages_extend( + struct xfs_inode *ip, + struct writeback_control *wbc) +{ + unsigned int bsize = xfs_inode_alloc_unitsize(ip); + long long int pages_to_write; + loff_t next = wbc->range_end + 1; + + wbc->range_start = rounddown_64(wbc->range_start, bsize); + if (wbc->range_end != LLONG_MAX) + wbc->range_end = roundup_64(next, bsize) - 1; + + if (wbc->nr_to_write != LONG_MAX) { + pgoff_t pg_start = wbc->range_start >> PAGE_SHIFT; + pgoff_t pg_next = (wbc->range_end + 1) >> PAGE_SHIFT; + + pages_to_write = pg_next - pg_start; + if (pages_to_write >= LONG_MAX) + pages_to_write = LONG_MAX; + if (wbc->nr_to_write < pages_to_write) + wbc->nr_to_write = pages_to_write; + } + + trace_xfs_vm_writepages_extend(ip, wbc); +} + STATIC int xfs_vm_writepages( struct address_space *mapping, @@ -512,6 +544,9 @@ xfs_vm_writepages( trace_xfs_vm_writepages(ip, wbc); + if (xfs_inode_needs_cow_around(ip)) + xfs_vm_writepages_extend(ip, wbc); + xfs_iflags_clear(ip, XFS_ITRUNCATED); return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); } @@ -525,6 +560,9 @@ xfs_dax_writepages( trace_xfs_dax_writepages(ip, wbc); + if (xfs_inode_needs_cow_around(ip)) + xfs_vm_writepages_extend(ip, wbc); + xfs_iflags_clear(ip, XFS_ITRUNCATED); return 
dax_writeback_mapping_range(mapping, xfs_inode_buftarg(ip)->bt_daxdev, wbc); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 39df20ae702c8..4767fc49c4641 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1584,6 +1584,7 @@ DEFINE_EVENT(xfs_writeback_class, name, \ TP_PROTO(struct xfs_inode *ip, const struct writeback_control *wbc), \ TP_ARGS(ip, wbc)) DEFINE_WRITEBACK_EVENT(xfs_vm_writepages); +DEFINE_WRITEBACK_EVENT(xfs_vm_writepages_extend); DEFINE_WRITEBACK_EVENT(xfs_dax_writepages); DECLARE_EVENT_CLASS(xfs_simple_io_class, From patchwork Sun Dec 31 21:57:36 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13507763 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 25A5D11704 for ; Sun, 31 Dec 2023 21:57:37 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="gY6nJU5b" Received: by smtp.kernel.org (Postfix) with ESMTPSA id E5A14C433C8; Sun, 31 Dec 2023 21:57:36 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704059857; bh=5Rr9JlF3EjsLQbhtb33DDUNhRGF5Q7iAcUYO9H9g3ow=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=gY6nJU5bfxdx59/YKspvRGazZ60M1OBhd0NzahSgXRLyZwV1awPesqIdwpjXyAM0O RIpVZfqsxRTDiaqrFy1FwV0C+unzkyFfqQBfijy730Yq4LmhDpOjmfPOcTa7fT8Vyn S2592wVMcbU+Ela4c9FTfw/O+V1wKsQbwdGWCjX+ZrKlpXcY0pTuMIi73nC+5Wlk/v lG0/HULmIUu+OUrcXqbqHMKIHYci75rAWyWUOPLHLp+fvEmoxlDAEuof3v6NxSqE7O 17Jf9y0kzC4BeN8utcFvK78Yh9+Z7cYiQi1wzqlUAdbAhTFZtK4IG0Fsk/6tc5Bfhi /pXu1BeCrfqkw== Date: Sun, 31 Dec 2023 13:57:36 -0800 Subject: [PATCH 6/9] xfs: enable extent size hints for CoW when rtextsize > 1 From: "Darrick J. 
Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170404852767.1767395.6394408204682530311.stgit@frogsfrogsfrogs> In-Reply-To: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs> References: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong CoW extent size hints are not allowed on filesystems that have large realtime extents because we only want to perform the minimum required amount of write-around (aka write amplification) for shared extents. On filesystems where rtextsize > 1, allocations can only be done in units of full rt extents, which means that we can only map an entire rt extent's worth of blocks into the data fork. Hole punch requests become conversions to unwritten if the request isn't aligned properly. Because a copy-write fundamentally requires remapping, this means that we also can only do copy-writes of a full rt extent. This is too expensive for large hint sizes, since it's all or nothing. Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 41354bdbbc90f..c6ab6a38bc7cd 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6408,6 +6408,28 @@ xfs_get_cowextsz_hint( if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) a = ip->i_cowextsize; if (XFS_IS_REALTIME_INODE(ip)) { + /* + * For realtime files, the realtime extent is the fundamental + * unit of allocation. This means that data sharing and CoW + * remapping can only be done in those units. For filesystems + * where the extent size is larger than one block, write + * requests that are not aligned to an extent boundary employ + * an unshare-around strategy to ensure that all pages for a + * shared extent are fully dirtied. 
+ * + * Because the remapping alignment requirement applies equally + * to all CoW writes, any regular overwrites that could be + * turned (by a speculative CoW preallocation) into a CoW write + * must either employ this dirty-around strategy, or be smart + * enough to ignore the CoW fork mapping unless the entire + * extent is dirty or becomes shared by writeback time. Doing + * the first would dramatically increase write amplification, + * and the second would require deeper insight into the state + * of the page cache during a writeback request. For now, we + * ignore the hint. + */ + if (ip->i_mount->m_sb.sb_rextsize > 1) + return ip->i_mount->m_sb.sb_rextsize; b = 0; if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) b = ip->i_extsize; From patchwork Sun Dec 31 21:57:52 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13507764 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id D1A6C11719 for ; Sun, 31 Dec 2023 21:57:52 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="KDxIVo/E" Received: by smtp.kernel.org (Postfix) with ESMTPSA id A2D31C433C7; Sun, 31 Dec 2023 21:57:52 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704059872; bh=JTrp0euvZhFwTlDQTknBtC435Kc/a8OpWqmBPFVOONI=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=KDxIVo/E+KTDgjgIb8hBrnaB/iKPoxQXMD47zrxLLga7CGJJqtIzMgkpCyJgd9NwI pLl2UWxBr9xz2Tif663r/ZLtR4yBsVB1NXqxpLdEvpDj3kk7zhVFWZIwcsr93o8R6p lKZ5DR5J4sn24teer/gmmGhF5NrEdEXm+1Ds1ILkJz+GOaYmnete9nAfhAO1Q7957c 6xcJGybLQoKizU75isttunlivjsAohZyQjkfXWPKCVV43Xtyu3tdnBLVr/cQmn1FKr 
Xa8+vs+8PwpBCQzKWmNb9Ypm/BC+us8g6ucjbH21N3VeIX4lKFywcPg/oNEQGKDqGY 6HWcv7oJBGZwQ== Date: Sun, 31 Dec 2023 13:57:52 -0800 Subject: [PATCH 7/9] xfs: allow reflink on the rt volume when extent size is larger than 1 rt block From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170404852783.1767395.4576654556100736347.stgit@frogsfrogsfrogs> In-Reply-To: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs> References: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Make the necessary tweaks to the reflink remapping code to support remapping on the realtime volume when the rt extent size is larger than a single rt block. We need to check that the remap arguments from userspace are aligned to a rt extent boundary, and that the length is always aligned, even if the kernel tried to round it up to EOF for us. XFS can only map and remap full rt extents, so we have to be a little more strict about the alignment there. Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_reflink.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++---- fs/xfs/xfs_rtalloc.c | 2 + fs/xfs/xfs_super.c | 19 +++++++++--- fs/xfs/xfs_trace.h | 3 ++ 4 files changed, 93 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 5d68603506f27..d516f3a35df36 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1534,6 +1534,13 @@ xfs_reflink_remap_blocks( len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len), XFS_MAX_FILEOFF); + /* + * Make sure the end is aligned with an allocation unit, even if it's + * past EOF. 
+ */ + if (xfs_inode_has_bigallocunit(dest)) + len = xfs_rtb_roundup_rtx(mp, len); + trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff); while (len > 0) { @@ -1607,6 +1614,57 @@ xfs_reflink_zero_posteof( return xfs_zero_range(ip, isize, pos - isize, NULL); } +#ifdef CONFIG_XFS_RT +/* + * Adjust the length of the remap operation to end on an allocation unit (AU) + * boundary. + */ +STATIC int +xfs_reflink_adjust_bigalloc_len( + struct xfs_inode *src, + loff_t pos_in, + struct xfs_inode *dest, + loff_t pos_out, + loff_t *len, + unsigned int remap_flags) +{ + unsigned int alloc_unit = xfs_inode_alloc_unitsize(src); + uint32_t mod; + + div_u64_rem(*len, alloc_unit, &mod); + + /* + * We previously checked the AU alignment of both offsets, so we now + * have to check the AU alignment of the length. The VFS remap prep + * function can change the length on us, so we can only make length + * adjustments after that. If the length is aligned to an AU, we're + * good to go. + * + * Otherwise, the length is not aligned to an AU. If the source file's + * range ends at EOF, the VFS ensured that the dest file's range also + * ends at EOF. The actual remap function will round the (byte) length + * up to the nearest AU, so we're ok here too. + */ + if (mod == 0 || pos_in + *len == i_size_read(VFS_I(src))) + return 0; + + /* + * Otherwise, the only thing we can do is round the request length down + * to an AU boundary. If the caller doesn't allow that, we cannot move + * forward. + */ + if (!(remap_flags & REMAP_FILE_CAN_SHORTEN)) + return -EINVAL; + + /* Trim the unaligned tail so the length ends on an AU boundary. */ + (*len) -= mod; + trace_xfs_reflink_adjust_bigalloc_len(src, pos_in, *len, dest, pos_out); + return 0; +} +#else +# define xfs_reflink_adjust_bigalloc_len(...) (0) +#endif /* CONFIG_XFS_RT */ + /* * Prepare two files for range cloning.
Upon a successful return both inodes * will have the iolock and mmaplock held, the page cache of the out file will @@ -1649,6 +1707,7 @@ xfs_reflink_remap_prep( struct xfs_inode *src = XFS_I(inode_in); struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); + const struct iomap_ops *dax_read_ops = NULL; int ret; /* Lock both files against IO */ @@ -1666,15 +1725,25 @@ xfs_reflink_remap_prep( if (IS_DAX(inode_in) != IS_DAX(inode_out)) goto out_unlock; - if (!IS_DAX(inode_in)) - ret = generic_remap_file_range_prep(file_in, pos_in, file_out, - pos_out, len, remap_flags); - else - ret = dax_remap_file_range_prep(file_in, pos_in, file_out, - pos_out, len, remap_flags, &xfs_read_iomap_ops); + ASSERT(is_power_of_2(xfs_inode_alloc_unitsize(dest))); + + if (IS_DAX(inode_in)) + dax_read_ops = &xfs_read_iomap_ops; + + ret = __generic_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags, dax_read_ops, + xfs_inode_alloc_unitsize(dest)); if (ret || *len == 0) goto out_unlock; + /* Adjust the end to align to an allocation unit. 
*/ + if (xfs_inode_has_bigallocunit(src)) { + ret = xfs_reflink_adjust_bigalloc_len(src, pos_in, dest, + pos_out, len, remap_flags); + if (ret || *len == 0) + goto out_unlock; + } + /* Attach dquots to dest inode before changing block map */ ret = xfs_qm_dqattach(dest); if (ret) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 11b6645a5a534..c617c326125b3 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1253,7 +1253,7 @@ xfs_growfs_rt( return -EOPNOTSUPP; if (xfs_has_quota(mp)) return -EOPNOTSUPP; - if (xfs_has_reflink(mp) && in->extsize != 1) + if (xfs_has_reflink(mp) && !is_power_of_2(mp->m_sb.sb_rextsize)) return -EOPNOTSUPP; nrblocks = in->newblocks; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bf0c0ce9a54b9..c17e1d06820d1 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1729,13 +1729,22 @@ xfs_fs_fill_super( if (xfs_has_reflink(mp)) { /* - * Reflink doesn't support rt extent sizes larger than a single - * block because we would have to perform unshare-around for - * rtext-unaligned write requests. + * Reflink doesn't support pagecache pages that span multiple + * realtime extents because iomap doesn't track subpage dirty + * state. This means that we cannot dirty all the pages + * backing an rt extent without dirtying the adjoining rt + * extents. If those rt extents are shared and extend into + * other pages, this leads to crazy write amplification. The + * VFS remap_range checks assume power-of-two block sizes. + * + * Hence we only support rt extent sizes that are an integer + * power of two because we know those will align with the page + * size. 
*/ - if (xfs_has_realtime(mp) && mp->m_sb.sb_rextsize != 1) { + if (xfs_has_realtime(mp) && + !is_power_of_2(mp->m_sb.sb_rextsize)) { xfs_alert(mp, - "reflink not compatible with realtime extent size %u!", + "reflink not compatible with non-power-of-2 realtime extent size %u!", mp->m_sb.sb_rextsize); error = -EINVAL; goto out_filestream_unmount; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 4767fc49c4641..906e35eef223d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3887,6 +3887,9 @@ TRACE_EVENT(xfs_reflink_remap_blocks, __entry->dest_lblk) ); DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range); +#ifdef CONFIG_XFS_RT +DEFINE_DOUBLE_IO_EVENT(xfs_reflink_adjust_bigalloc_len); +#endif /* CONFIG_XFS_RT */ DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error); From patchwork Sun Dec 31 21:58:07 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13507765 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id D52A211704 for ; Sun, 31 Dec 2023 21:58:08 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="RyUSv/e/" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 49786C433C7; Sun, 31 Dec 2023 21:58:08 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704059888; bh=q8OdXz0+so6B6GeDCKtdbJYvElax7dazHtawKdCqHuw=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=RyUSv/e/zopxH1cch1RoJF6V5Ba5b2Wd+inSKAti8leU1bgKDdHHGct+l6x/uNjtr ynNSclKQ2/Jsi4tvB7it46Ybw/1zfefaFNNl1IifHGeeoWcSZtRJif0wl21O8mNK4H Lrsnkq1tJyMiOW679j83e5eVPPa7EGtcTAwYq/jGfIE/t+3zRj4JPj0dlgYm57Czei TZvC6Kskn0jCnltQx/zzOXAtJLOiUcYfADunXt/oQ+1ea0H+O0yC1rJ3enG1zUQwqR 8xoOvICsnbjZRso4N/lbA8SNV2NVPwoA7CyFZSXE0kdgnDTDkmKcYI14/LuTnFxxPC 2C50fEZi2uVfA== Date: Sun, 31 Dec 2023 13:58:07 -0800 Subject: [PATCH 8/9] xfs: fix integer overflow when validating extent size hints From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170404852799.1767395.13164066067840149369.stgit@frogsfrogsfrogs> In-Reply-To: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs> References: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Both file extent size hints are stored as 32-bit quantities, in units of filesystem blocks. As part of validating the hints, we convert these quantities to bytes to ensure that the hint is congruent with the file's allocation size. 
The maximum possible hint value is 2097151 (aka XFS_MAX_BMBT_EXTLEN). If the file allocation unit is larger than 2048 bytes, the unit conversion will exceed 32 bits in size, which overflows the uint32_t used to store the value used in the comparison. This isn't a problem for files on the data device since the hint will always be a multiple of the block size. However, this is a problem for realtime files because the rtextent size can be any integer number of fs blocks, and truncation of upper bits changes the outcome of division. Eliminate the overflow by performing the congruency check in units of blocks, not bytes. Otherwise, we get errors like this: $ truncate -s 500T /tmp/a $ mkfs.xfs -f -N /tmp/a -d extszinherit=2097151,rtinherit=1 -r extsize=28k illegal extent size hint 2097151, must be less than 2097151 and a multiple of 7. Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_inode_buf.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 81a12ca8ec434..adc457da52ef0 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -770,13 +770,11 @@ xfs_inode_validate_extsize( bool rt_flag; bool hint_flag; bool inherit_flag; - uint32_t extsize_bytes; - uint32_t blocksize_bytes; + uint32_t alloc_unit = 1; rt_flag = (flags & XFS_DIFLAG_REALTIME); hint_flag = (flags & XFS_DIFLAG_EXTSIZE); inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT); - extsize_bytes = XFS_FSB_TO_B(mp, extsize); /* * This comment describes a historic gap in this verifier function.
@@ -805,9 +803,7 @@ xfs_inode_validate_extsize( */ if (rt_flag) - blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); - else - blocksize_bytes = mp->m_sb.sb_blocksize; + alloc_unit = mp->m_sb.sb_rextsize; if ((hint_flag || inherit_flag) && !(S_ISDIR(mode) || S_ISREG(mode))) return __this_address; @@ -825,7 +821,7 @@ xfs_inode_validate_extsize( if (mode && !(hint_flag || inherit_flag) && extsize != 0) return __this_address; - if (extsize_bytes % blocksize_bytes) + if (extsize % alloc_unit) return __this_address; if (extsize > XFS_MAX_BMBT_EXTLEN) @@ -860,12 +856,10 @@ xfs_inode_validate_cowextsize( { bool rt_flag; bool hint_flag; - uint32_t cowextsize_bytes; - uint32_t blocksize_bytes; + uint32_t alloc_unit = 1; rt_flag = (flags & XFS_DIFLAG_REALTIME); hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE); - cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize); /* * Similar to extent size hints, a directory can be configured to @@ -880,9 +874,7 @@ xfs_inode_validate_cowextsize( */ if (rt_flag) - blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); - else - blocksize_bytes = mp->m_sb.sb_blocksize; + alloc_unit = mp->m_sb.sb_rextsize; if (hint_flag && !xfs_has_reflink(mp)) return __this_address; @@ -897,7 +889,7 @@ xfs_inode_validate_cowextsize( if (mode && !hint_flag && cowextsize != 0) return __this_address; - if (cowextsize_bytes % blocksize_bytes) + if (cowextsize % alloc_unit) return __this_address; if (cowextsize > XFS_MAX_BMBT_EXTLEN) From patchwork Sun Dec 31 21:58:23 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13507766 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 19359BA43 for ; Sun, 31 Dec 2023 21:58:24 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="OM/+kaHs" Received: by smtp.kernel.org (Postfix) with ESMTPSA id E006EC433C7; Sun, 31 Dec 2023 21:58:23 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704059903; bh=I/yVVacjB0X38rOipjRkfFkp0EjKgmJTpfQfV4GkDgg=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=OM/+kaHsUaNjkODcQsOwwIoBB3XBBCIehtCYKLIP/Aux/0JiaXcsltHx5xyK4Xvom 3Kx3bsEi1+wdgzvKiTeoyVEzIuas9TQ2Be9Ql2Y4a4KxLIigWU0M09bYumdkXhQsqE 7XsMKzosGplhahQaUueL5HkMbmr1tZWE3cKkH4OEin1cxfYGdHMhqNJUYt5pUnGWqX XD8fZHICZiZmbfRYblZEkMxOjEnzZLcCYGHN6GA78ikMAdxy2G380hr3TC5sw+uMs7 1Beim+XwMvBkc++jEFlFQeXTW39Pz1VNmPNyzDtqibylvI88nPiYMAFOTyiUUsC0uV Qte6DHy/czu/Q== Date: Sun, 31 Dec 2023 13:58:23 -0800 Subject: [PATCH 9/9] xfs: support realtime reflink with an extent size that isn't a power of 2 From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170404852815.1767395.842469542235107122.stgit@frogsfrogsfrogs> In-Reply-To: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs> References: <170404852650.1767395.17654728220580066333.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Add the necessary alignment checking code to the reflink remap code to ensure that remap requests are aligned to rt extent boundaries if the realtime extent size isn't a power of two. 
The VFS helpers assume that they can use the usual (blocksize - 1) masking to avoid slow 64-bit division, but since XFS is special we won't make everyone pay that cost for our weird edge case. Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_reflink.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++-- fs/xfs/xfs_rtalloc.c | 3 +- fs/xfs/xfs_super.c | 12 +++---- 3 files changed, 97 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index d516f3a35df36..0c54522404963 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1665,6 +1665,83 @@ xfs_reflink_adjust_bigalloc_len( # define xfs_reflink_adjust_bigalloc_len(...) (0) #endif /* CONFIG_XFS_RT */ +/* + * Check the alignment of a remap request when the allocation unit size isn't a + * power of two. The VFS helpers use (fast) bitmask-based alignment checks, + * but here we have to use slow long division. + */ +static int +xfs_reflink_remap_check_rtalign( + struct xfs_inode *ip_in, + loff_t pos_in, + struct xfs_inode *ip_out, + loff_t pos_out, + loff_t *req_len, + unsigned int remap_flags) +{ + struct xfs_mount *mp = ip_in->i_mount; + uint32_t rextbytes; + loff_t in_size, out_size; + loff_t new_length, length = *req_len; + loff_t blen; + + rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); + in_size = i_size_read(VFS_I(ip_in)); + out_size = i_size_read(VFS_I(ip_out)); + + /* The start of both ranges must be aligned to a rt extent. */ + if (!isaligned_64(pos_in, rextbytes) || + !isaligned_64(pos_out, rextbytes)) + return -EINVAL; + + if (length == 0) + length = in_size - pos_in; + + /* + * If the user wanted us to remap up to the infile's EOF, round up + * to the next rt extent boundary for this check. + * + * Otherwise, round the range length down to a rt extent boundary. We + * already confirmed the starting offsets' extent alignment.
+ */ + if (pos_in + length == in_size) + blen = roundup_64(in_size, rextbytes) - pos_in; + else + blen = rounddown_64(length, rextbytes); + + /* Don't allow overlapped remappings within the same file. */ + if (ip_in == ip_out && + pos_out + blen > pos_in && + pos_in + blen > pos_out) + return -EINVAL; + + /* + * Ensure that we don't remap a partial EOF extent into the middle + * of another file. + */ + if (isaligned_64(length, rextbytes)) + return 0; + + new_length = length; + if (pos_out + length < out_size) + new_length = rounddown_64(new_length, rextbytes); + + if (new_length == length) + return 0; + + /* + * Return the shortened request if the caller permits it. If the + * request was shortened to zero rt extents, we know that the original + * arguments weren't valid in the first place. + */ + if ((remap_flags & REMAP_FILE_CAN_SHORTEN) && new_length > 0) { + *req_len = new_length; + return 0; + } + + return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; +} + /* * Prepare two files for range cloning. Upon a successful return both inodes * will have the iolock and mmaplock held, the page cache of the out file will @@ -1708,6 +1785,7 @@ xfs_reflink_remap_prep( struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); const struct iomap_ops *dax_read_ops = NULL; + unsigned int alloc_unit = xfs_inode_alloc_unitsize(dest); int ret; /* Lock both files against IO */ @@ -1725,14 +1803,22 @@ xfs_reflink_remap_prep( if (IS_DAX(inode_in) != IS_DAX(inode_out)) goto out_unlock; - ASSERT(is_power_of_2(xfs_inode_alloc_unitsize(dest))); + /* Check non-power of two alignment issues, if necessary. */ + if (XFS_IS_REALTIME_INODE(dest) && !is_power_of_2(alloc_unit)) { + ret = xfs_reflink_remap_check_rtalign(src, pos_in, dest, + pos_out, len, remap_flags); + if (ret) + goto out_unlock; + + /* Do the VFS checks with the regular block alignment.
*/ + alloc_unit = src->i_mount->m_sb.sb_blocksize; + } if (IS_DAX(inode_in)) dax_read_ops = &xfs_read_iomap_ops; ret = __generic_remap_file_range_prep(file_in, pos_in, file_out, - pos_out, len, remap_flags, dax_read_ops, - xfs_inode_alloc_unitsize(dest)); + pos_out, len, remap_flags, dax_read_ops, alloc_unit); if (ret || *len == 0) goto out_unlock; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index c617c326125b3..7917eaef911f6 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1253,7 +1253,8 @@ xfs_growfs_rt( return -EOPNOTSUPP; if (xfs_has_quota(mp)) return -EOPNOTSUPP; - if (xfs_has_reflink(mp) && !is_power_of_2(mp->m_sb.sb_rextsize)) + if (xfs_has_reflink(mp) && !is_power_of_2(mp->m_sb.sb_rextsize) && + (XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) & ~PAGE_MASK)) return -EOPNOTSUPP; nrblocks = in->newblocks; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c17e1d06820d1..b5291b0ea21d9 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1734,17 +1734,17 @@ xfs_fs_fill_super( * state. This means that we cannot dirty all the pages * backing an rt extent without dirtying the adjoining rt * extents. If those rt extents are shared and extend into - * other pages, this leads to crazy write amplification. The - * VFS remap_range checks assume power-of-two block sizes. + * other pages, this leads to crazy write amplification. * * Hence we only support rt extent sizes that are an integer - * power of two because we know those will align with the page - * size. + * power of two or an integer multiple of the page size because + * we know those will align with the page size. 
*/ if (xfs_has_realtime(mp) && - !is_power_of_2(mp->m_sb.sb_rextsize)) { + !is_power_of_2(mp->m_sb.sb_rextsize) && + (XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) & ~PAGE_MASK)) { xfs_alert(mp, - "reflink not compatible with non-power-of-2 realtime extent size %u!", + "reflink not compatible with realtime extent size %u!", mp->m_sb.sb_rextsize); error = -EINVAL; goto out_filestream_unmount;