[v6,10/13] xfs: iomap COW-based atomic write support

Message ID 20250313171310.1886394-11-john.g.garry@oracle.com (mailing list archive)
State New
Series: large atomic writes for xfs with CoW

Commit Message

John Garry March 13, 2025, 5:13 p.m. UTC
In cases of an atomic write covering misaligned or discontiguous disk
blocks, we will use a CoW-based method to issue the atomic write.

So, for that case, return -EAGAIN to request that the write be issued in
CoW atomic write mode. The dio write path should detect this, similar to
how misaligned regular DIO writes are handled.

For the normal REQ_ATOMIC-based mode, when the range which we are
atomically writing covers a shared data extent, try to allocate a new
CoW fork extent. However, if we find that what we allocated does not
meet atomic write requirements in terms of length and alignment, then
fall back to the CoW-based mode for the atomic write.

Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/xfs/xfs_iomap.c | 131 ++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_iomap.h |   1 +
 2 files changed, 130 insertions(+), 2 deletions(-)
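
As a rough illustration of how the dio write path can consume this
-EAGAIN, here is a minimal sketch: first try the plain REQ_ATOMIC-based
path, then retry with the CoW-based iomap ops added below. The helper
name, the IOCB_NOWAIT handling and the exact iomap_dio_rw() calls are
illustrative assumptions (the real plumbing is outside this patch, and
locking is omitted):

static noinline ssize_t
xfs_file_dio_write_atomic(	/* hypothetical helper, for illustration */
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	ssize_t			ret;

	/* Try the plain REQ_ATOMIC-based path first. */
	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
			&xfs_dio_write_ops, 0, NULL, 0);

	/*
	 * -EAGAIN from ->iomap_begin means the mapping was misaligned
	 * or discontiguous, so retry in CoW-based atomic write mode,
	 * unless the caller asked for RWF_NOWAIT semantics.
	 */
	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT))
		ret = iomap_dio_rw(iocb, from,
				&xfs_atomic_write_cow_iomap_ops,
				&xfs_dio_write_ops, 0, NULL, 0);
	return ret;
}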

Comments

Ritesh Harjani (IBM) March 16, 2025, 6:53 a.m. UTC | #1
Hello,

John Garry <john.g.garry@oracle.com> writes:

> In cases of an atomic write covering misaligned or discontiguous disk
> blocks, we will use a CoW-based method to issue the atomic write.

Looks like the first-time write to a given logical range of a file
(e.g. an append write or a write into a hole) will also result in the
CoW-based fallback method, right? More on that below. The commit msg
should capture that as well, IMO.

>
> So, for that case, return -EAGAIN to request that the write be issued in
> CoW atomic write mode. The dio write path should detect this, similar to
> how misaligned regular DIO writes are handled.
>
> For the normal REQ_ATOMIC-based mode, when the range which we are
> atomically writing covers a shared data extent, try to allocate a new
> CoW fork extent. However, if we find that what we allocated does not
> meet atomic write requirements in terms of length and alignment, then
> fall back to the CoW-based mode for the atomic write.
>
> Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/xfs/xfs_iomap.c | 131 ++++++++++++++++++++++++++++++++++++++++++++-
>  fs/xfs/xfs_iomap.h |   1 +
>  2 files changed, 130 insertions(+), 2 deletions(-)
>
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 8196e66b099b..88d86cabb8a1 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -798,6 +798,23 @@ imap_spans_range(
>  	return true;
>  }
>  
> +static bool
> +xfs_bmap_valid_for_atomic_write(
> +	struct xfs_bmbt_irec	*imap,
> +	xfs_fileoff_t		offset_fsb,
> +	xfs_fileoff_t		end_fsb)
> +{
> +	/* Misaligned start block wrt size */
> +	if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount))
> +		return false;
> +
> +	/* Discontiguous extents */
> +	if (!imap_spans_range(imap, offset_fsb, end_fsb))
> +		return false;
> +
> +	return true;
> +}
> +
>  static int
>  xfs_direct_write_iomap_begin(
>  	struct inode		*inode,
> @@ -812,10 +829,12 @@ xfs_direct_write_iomap_begin(
>  	struct xfs_bmbt_irec	imap, cmap;
>  	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
>  	xfs_fileoff_t		end_fsb = xfs_iomap_end_fsb(mp, offset, length);
> +	xfs_fileoff_t		orig_end_fsb = end_fsb;
>  	int			nimaps = 1, error = 0;
>  	unsigned int		reflink_flags = 0;
>  	bool			shared = false;
>  	u16			iomap_flags = 0;
> +	bool			needs_alloc;
>  	unsigned int		lockmode;
>  	u64			seq;
>  
> @@ -877,13 +896,44 @@ xfs_direct_write_iomap_begin(
>  				&lockmode, reflink_flags);
>  		if (error)
>  			goto out_unlock;
> -		if (shared)
> +		if (shared) {
> +			/*
> +			 * Since we got a CoW fork extent mapping, ensure that
> +			 * the mapping is actually suitable for an
> +			 * REQ_ATOMIC-based atomic write, i.e. properly aligned
> +			 * and covers the full range of the write. Otherwise,
> +			 * we need to use the COW-based atomic write mode.
> +			 */
> +			if ((flags & IOMAP_ATOMIC) &&
> +			    !xfs_bmap_valid_for_atomic_write(&cmap,
> +					offset_fsb, end_fsb)) {
> +				error = -EAGAIN;
> +				goto out_unlock;
> +			}
>  			goto out_found_cow;
> +		}
>  		end_fsb = imap.br_startoff + imap.br_blockcount;
>  		length = XFS_FSB_TO_B(mp, end_fsb) - offset;
>  	}
>  
> -	if (imap_needs_alloc(inode, flags, &imap, nimaps))
> +	needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps);
> +
> +	if (flags & IOMAP_ATOMIC) {
> +		error = -EAGAIN;
> +		/*
> +		 * If we allocate less than what is required for the write
> +		 * then we may end up with multiple mappings, which means that
> +		 * REQ_ATOMIC-based cannot be used, so avoid this possibility.
> +		 */
> +		if (needs_alloc && orig_end_fsb - offset_fsb > 1)
> +			goto out_unlock;

I have a quick question here. Based on the above check, it looks like
allocation requests on a hole or a first-time allocation (append
writes) for a given logical range will always be done using the CoW
fallback mechanism, won't they? So that means a HW-based multi-fsblock
atomic write request will only happen for overwrites
(non-discontiguous extents), correct?

Now, it's not always the case that allocating an extent for the given
range results in discontiguous extents. E.g., if the entire range being
written to is a hole, or for append writes, it might just allocate a
single unwritten extent, which is valid for doing an atomic write using
HW/BIOs, right?
And it is valid to write using an unwritten extent as long as we don't
have mixed mappings, i.e. the entire range should be either unwritten
or written for the atomic write to be untorn, correct?

I am guessing this is intentional?

-ritesh

> +
> +		if (!xfs_bmap_valid_for_atomic_write(&imap, offset_fsb,
> +				orig_end_fsb))
> +			goto out_unlock;
> +	}
> +
> +	if (needs_alloc)
>  		goto allocate_blocks;
>  
>  	/*
> @@ -1024,6 +1074,83 @@ const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
>  };
>  #endif /* CONFIG_XFS_RT */
>  
> +static int
> +xfs_atomic_write_cow_iomap_begin(
> +	struct inode		*inode,
> +	loff_t			offset,
> +	loff_t			length,
> +	unsigned		flags,
> +	struct iomap		*iomap,
> +	struct iomap		*srcmap)
> +{
> +	ASSERT(flags & IOMAP_WRITE);
> +	ASSERT(flags & IOMAP_DIRECT);
> +
> +	struct xfs_inode	*ip = XFS_I(inode);
> +	struct xfs_mount	*mp = ip->i_mount;
> +	struct xfs_bmbt_irec	imap, cmap;
> +	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
> +	xfs_fileoff_t		end_fsb = xfs_iomap_end_fsb(mp, offset, length);
> +	int			nimaps = 1, error;
> +	bool			shared = false;
> +	unsigned int		lockmode = XFS_ILOCK_EXCL;
> +	u64			seq;
> +
> +	if (xfs_is_shutdown(mp))
> +		return -EIO;
> +
> +	if (!xfs_has_reflink(mp))
> +		return -EINVAL;
> +
> +	error = xfs_ilock_for_iomap(ip, flags, &lockmode);
> +	if (error)
> +		return error;
> +
> +	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
> +			&nimaps, 0);
> +	if (error)
> +		goto out_unlock;
> +
> +	 /*
> +	  * Use XFS_REFLINK_ALLOC_EXTSZALIGN to hint at aligning new extents
> +	  * according to extszhint, such that there will be a greater chance
> +	  * that future atomic writes to that same range will be aligned (and
> +	  * don't require this COW-based method).
> +	  */
> +	error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
> +			&lockmode, XFS_REFLINK_CONVERT_UNWRITTEN |
> +			XFS_REFLINK_FORCE_COW | XFS_REFLINK_ALLOC_EXTSZALIGN);
> +	/*
> +	 * Don't check @shared. For atomic writes, we should error when
> +	 * we don't get a COW fork extent mapping.
> +	 */
> +	if (error)
> +		goto out_unlock;
> +
> +	end_fsb = imap.br_startoff + imap.br_blockcount;
> +
> +	length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
> +	trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
> +	if (imap.br_startblock != HOLESTARTBLOCK) {
> +		seq = xfs_iomap_inode_sequence(ip, 0);
> +		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
> +		if (error)
> +			goto out_unlock;
> +	}
> +	seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
> +	xfs_iunlock(ip, lockmode);
> +	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
> +
> +out_unlock:
> +	if (lockmode)
> +		xfs_iunlock(ip, lockmode);
> +	return error;
> +}
> +
> +const struct iomap_ops xfs_atomic_write_cow_iomap_ops = {
> +	.iomap_begin		= xfs_atomic_write_cow_iomap_begin,
> +};
> +
>  static int
>  xfs_dax_write_iomap_end(
>  	struct inode		*inode,
> diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
> index d330c4a581b1..674f8ac1b9bd 100644
> --- a/fs/xfs/xfs_iomap.h
> +++ b/fs/xfs/xfs_iomap.h
> @@ -56,5 +56,6 @@ extern const struct iomap_ops xfs_read_iomap_ops;
>  extern const struct iomap_ops xfs_seek_iomap_ops;
>  extern const struct iomap_ops xfs_xattr_iomap_ops;
>  extern const struct iomap_ops xfs_dax_write_iomap_ops;
> +extern const struct iomap_ops xfs_atomic_write_cow_iomap_ops;
>  
>  #endif /* __XFS_IOMAP_H__*/
> -- 
> 2.31.1
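
For context on what xfs_bmap_valid_for_atomic_write() enforces: on a
4 KiB-block filesystem, a 16 KiB atomic write must map to a single
extent whose br_startblock is aligned to 4 blocks and which spans all
4 blocks; anything else falls back to the CoW-based mode. Whichever
mode XFS picks internally, the userspace contract is the same
untorn-write API. Below is a minimal sketch of a caller; the file name
and the 16 KiB size are made-up assumptions (real limits should be
queried via statx() with STATX_WRITE_ATOMIC):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC	0x00000040	/* from uapi <linux/fs.h>, Linux 6.11+ */
#endif

int main(void)
{
	const size_t len = 16384;	/* assumed <= stx_atomic_write_unit_max */
	struct iovec iov;
	void *buf = NULL;
	ssize_t ret;
	int fd;

	fd = open("testfile", O_RDWR | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/*
	 * O_DIRECT needs an aligned buffer; atomic writes additionally
	 * require a power-of-two length and a naturally aligned offset.
	 */
	if (posix_memalign(&buf, len, len)) {
		close(fd);
		return 1;
	}
	memset(buf, 0xab, len);

	iov.iov_base = buf;
	iov.iov_len = len;

	/*
	 * RWF_ATOMIC requests an untorn write; the kernel fails it with
	 * EINVAL if the length/offset violate the advertised limits.
	 */
	ret = pwritev2(fd, &iov, 1, 0, RWF_ATOMIC);
	if (ret < 0)
		perror("pwritev2");

	free(buf);
	close(fd);
	return ret == (ssize_t)len ? 0 : 1;
}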