
[v6,10/13] xfs: iomap COW-based atomic write support

Message ID 20250313171310.1886394-11-john.g.garry@oracle.com (mailing list archive)
State New
Series large atomic writes for xfs with CoW

Commit Message

John Garry March 13, 2025, 5:13 p.m. UTC
In cases of an atomic write covering misaligned or discontiguous disk
blocks, we will use a CoW-based method to issue the atomic write.

So, for that case, return -EAGAIN to request that the write be issued in
CoW atomic write mode. The dio write path should detect this, similar to
how misaligned regular DIO writes are handled.

For normal REQ_ATOMIC-based mode, when the range which we are atomic
writing to covers a shared data extent, try to allocate a new CoW fork.
However, if we find that what we allocated does not meet atomic write
requirements in terms of length and alignment, then fall back on the
CoW-based mode for the atomic write.
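
For reference, the dio write path side of this contract can be pictured
roughly as in the sketch below. The helper name and the exact retry wiring
here are illustrative only; the real caller changes live in other patches
of this series.

static ssize_t
xfs_file_dio_write_atomic(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	const struct iomap_ops	*ops = &xfs_direct_write_iomap_ops;
	ssize_t			ret;

retry:
	ret = iomap_dio_rw(iocb, from, ops, &xfs_dio_write_ops, 0, NULL, 0);

	/*
	 * The REQ_ATOMIC-based iomap_begin returned -EAGAIN because the
	 * mapping was misaligned or discontiguous, so retry the whole
	 * write through the COW-based iomap ops instead.
	 */
	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT) &&
	    ops == &xfs_direct_write_iomap_ops) {
		ops = &xfs_atomic_write_cow_iomap_ops;
		goto retry;
	}
	return ret;
}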

Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/xfs/xfs_iomap.c | 131 ++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_iomap.h |   1 +
 2 files changed, 130 insertions(+), 2 deletions(-)

Comments

Ritesh Harjani (IBM) March 16, 2025, 6:53 a.m. UTC | #1
Hello,

John Garry <john.g.garry@oracle.com> writes:

> In cases of an atomic write covering misaligned or discontiguous disk
> blocks, we will use a CoW-based method to issue the atomic write.

Looks like a first-time write to a given logical range of a file (e.g. an
append write or a write over a hole) will also result in the CoW-based
fallback method, right? More on that below. The commit msg should
capture that as well IMO.

>
> So, for that case, return -EAGAIN to request that the write be issued in
> CoW atomic write mode. The dio write path should detect this, similar to
> how misaligned regular DIO writes are handled.
>
> For normal REQ_ATOMIC-based mode, when the range which we are atomic
> writing to covers a shared data extent, try to allocate a new CoW fork.
> However, if we find that what we allocated does not meet atomic write
> requirements in terms of length and alignment, then fallback on the
> CoW-based mode for the atomic write.
>
> Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/xfs/xfs_iomap.c | 131 ++++++++++++++++++++++++++++++++++++++++++++-
>  fs/xfs/xfs_iomap.h |   1 +
>  2 files changed, 130 insertions(+), 2 deletions(-)
>
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 8196e66b099b..88d86cabb8a1 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -798,6 +798,23 @@ imap_spans_range(
>  	return true;
>  }
>  
> +static bool
> +xfs_bmap_valid_for_atomic_write(
> +	struct xfs_bmbt_irec	*imap,
> +	xfs_fileoff_t		offset_fsb,
> +	xfs_fileoff_t		end_fsb)
> +{
> +	/* Misaligned start block wrt size */
> +	if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount))
> +		return false;
> +
> +	/* Discontiguous extents */
> +	if (!imap_spans_range(imap, offset_fsb, end_fsb))
> +		return false;
> +
> +	return true;
> +}
> +
>  static int
>  xfs_direct_write_iomap_begin(
>  	struct inode		*inode,
> @@ -812,10 +829,12 @@ xfs_direct_write_iomap_begin(
>  	struct xfs_bmbt_irec	imap, cmap;
>  	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
>  	xfs_fileoff_t		end_fsb = xfs_iomap_end_fsb(mp, offset, length);
> +	xfs_fileoff_t		orig_end_fsb = end_fsb;
>  	int			nimaps = 1, error = 0;
>  	unsigned int		reflink_flags = 0;
>  	bool			shared = false;
>  	u16			iomap_flags = 0;
> +	bool			needs_alloc;
>  	unsigned int		lockmode;
>  	u64			seq;
>  
> @@ -877,13 +896,44 @@ xfs_direct_write_iomap_begin(
>  				&lockmode, reflink_flags);
>  		if (error)
>  			goto out_unlock;
> -		if (shared)
> +		if (shared) {
> +			/*
> +			 * Since we got a CoW fork extent mapping, ensure that
> +			 * the mapping is actually suitable for an
> +			 * REQ_ATOMIC-based atomic write, i.e. properly aligned
> +			 * and covers the full range of the write. Otherwise,
> +			 * we need to use the COW-based atomic write mode.
> +			 */
> +			if ((flags & IOMAP_ATOMIC) &&
> +			    !xfs_bmap_valid_for_atomic_write(&cmap,
> +					offset_fsb, end_fsb)) {
> +				error = -EAGAIN;
> +				goto out_unlock;
> +			}
>  			goto out_found_cow;
> +		}
>  		end_fsb = imap.br_startoff + imap.br_blockcount;
>  		length = XFS_FSB_TO_B(mp, end_fsb) - offset;
>  	}
>  
> -	if (imap_needs_alloc(inode, flags, &imap, nimaps))
> +	needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps);
> +
> +	if (flags & IOMAP_ATOMIC) {
> +		error = -EAGAIN;
> +		/*
> +		 * If we allocate less than what is required for the write
> +		 * then we may end up with multiple mappings, which means that
> +		 * REQ_ATOMIC-based cannot be used, so avoid this possibility.
> +		 */
> +		if (needs_alloc && orig_end_fsb - offset_fsb > 1)
> +			goto out_unlock;

I have a quick question here. Based on the above check it looks like
allocation requests on a hole or a first-time allocation (append writes)
for a given logical range will always be done using the CoW fallback
mechanism, won't they? So that means HW-based multi-fsblock atomic write
requests will only happen for overwrites (non-discontiguous extents),
correct?

Now, it's not always the case that trying to allocate an extent for
the given range results in discontiguous extents. e.g. say, if the
entire range being written to is a hole or an append write, then it might
just allocate a single unwritten extent, which is valid for doing an
atomic write using HW/BIOs, right?
And it is valid to write using an unwritten extent as long as we don't have
mixed mappings, i.e. the entire range should either be unwritten or
written for the atomic write to be untorn, correct?

I am guessing this is kept intentional?

-ritesh

> +
> +		if (!xfs_bmap_valid_for_atomic_write(&imap, offset_fsb,
> +				orig_end_fsb))
> +			goto out_unlock;
> +	}
> +
> +	if (needs_alloc)
>  		goto allocate_blocks;
>  
>  	/*
> @@ -1024,6 +1074,83 @@ const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
>  };
>  #endif /* CONFIG_XFS_RT */
>  
> +static int
> +xfs_atomic_write_cow_iomap_begin(
> +	struct inode		*inode,
> +	loff_t			offset,
> +	loff_t			length,
> +	unsigned		flags,
> +	struct iomap		*iomap,
> +	struct iomap		*srcmap)
> +{
> +	ASSERT(flags & IOMAP_WRITE);
> +	ASSERT(flags & IOMAP_DIRECT);
> +
> +	struct xfs_inode	*ip = XFS_I(inode);
> +	struct xfs_mount	*mp = ip->i_mount;
> +	struct xfs_bmbt_irec	imap, cmap;
> +	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
> +	xfs_fileoff_t		end_fsb = xfs_iomap_end_fsb(mp, offset, length);
> +	int			nimaps = 1, error;
> +	bool			shared = false;
> +	unsigned int		lockmode = XFS_ILOCK_EXCL;
> +	u64			seq;
> +
> +	if (xfs_is_shutdown(mp))
> +		return -EIO;
> +
> +	if (!xfs_has_reflink(mp))
> +		return -EINVAL;
> +
> +	error = xfs_ilock_for_iomap(ip, flags, &lockmode);
> +	if (error)
> +		return error;
> +
> +	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
> +			&nimaps, 0);
> +	if (error)
> +		goto out_unlock;
> +
> +	 /*
> +	  * Use XFS_REFLINK_ALLOC_EXTSZALIGN to hint at aligning new extents
> +	  * according to extszhint, such that there will be a greater chance
> +	  * that future atomic writes to that same range will be aligned (and
> +	  * don't require this COW-based method).
> +	  */
> +	error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
> +			&lockmode, XFS_REFLINK_CONVERT_UNWRITTEN |
> +			XFS_REFLINK_FORCE_COW | XFS_REFLINK_ALLOC_EXTSZALIGN);
> +	/*
> +	 * Don't check @shared. For atomic writes, we should error when
> +	 * we don't get a COW fork extent mapping.
> +	 */
> +	if (error)
> +		goto out_unlock;
> +
> +	end_fsb = imap.br_startoff + imap.br_blockcount;
> +
> +	length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
> +	trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
> +	if (imap.br_startblock != HOLESTARTBLOCK) {
> +		seq = xfs_iomap_inode_sequence(ip, 0);
> +		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
> +		if (error)
> +			goto out_unlock;
> +	}
> +	seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
> +	xfs_iunlock(ip, lockmode);
> +	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
> +
> +out_unlock:
> +	if (lockmode)
> +		xfs_iunlock(ip, lockmode);
> +	return error;
> +}
> +
> +const struct iomap_ops xfs_atomic_write_cow_iomap_ops = {
> +	.iomap_begin		= xfs_atomic_write_cow_iomap_begin,
> +};
> +
>  static int
>  xfs_dax_write_iomap_end(
>  	struct inode		*inode,
> diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
> index d330c4a581b1..674f8ac1b9bd 100644
> --- a/fs/xfs/xfs_iomap.h
> +++ b/fs/xfs/xfs_iomap.h
> @@ -56,5 +56,6 @@ extern const struct iomap_ops xfs_read_iomap_ops;
>  extern const struct iomap_ops xfs_seek_iomap_ops;
>  extern const struct iomap_ops xfs_xattr_iomap_ops;
>  extern const struct iomap_ops xfs_dax_write_iomap_ops;
> +extern const struct iomap_ops xfs_atomic_write_cow_iomap_ops;
>  
>  #endif /* __XFS_IOMAP_H__*/
> -- 
> 2.31.1
Christoph Hellwig March 17, 2025, 7:26 a.m. UTC | #2
> +static bool
> +xfs_bmap_valid_for_atomic_write(

This is misnamed.  It checks whether the hardware offload can be used.

> +	/* Misaligned start block wrt size */
> +	if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount))
> +		return false;

It is very obvious that this checks for a natural alignment of the
block number.  But it fails to explain why it checks for that.

> +
> +	/* Discontiguous extents */
> +	if (!imap_spans_range(imap, offset_fsb, end_fsb))

Same here.

> +		if (shared) {
> +			/*
> +			 * Since we got a CoW fork extent mapping, ensure that
> +			 * the mapping is actually suitable for an
> +			 * REQ_ATOMIC-based atomic write, i.e. properly aligned
> +			 * and covers the full range of the write. Otherwise,
> +			 * we need to use the COW-based atomic write mode.
> +			 */
> +			if ((flags & IOMAP_ATOMIC) &&
> +			    !xfs_bmap_valid_for_atomic_write(&cmap,

The "Since.." implies there is something special about COW fork mappings.
But I don't think there is, or am I missing something?
If xfs_bmap_valid_for_atomic_write was properly named and documented
this comment should probably just go away.

> +static int
> +xfs_atomic_write_cow_iomap_begin(

Should the atomic and cow be together for coherent naming?
But even if the naming is coherent it isn't really
self-explanatory, so please add a little top of the function
comment introducing it.

> +	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
> +			&nimaps, 0);
> +	if (error)
> +		goto out_unlock;

Why does this need to read the existing data for mapping?  You'll
overwrite everything through the COW fork anyway.
John Garry March 17, 2025, 8:54 a.m. UTC | #3
>> +		}
>>   		end_fsb = imap.br_startoff + imap.br_blockcount;
>>   		length = XFS_FSB_TO_B(mp, end_fsb) - offset;
>>   	}
>>   
>> -	if (imap_needs_alloc(inode, flags, &imap, nimaps))
>> +	needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps);
>> +
>> +	if (flags & IOMAP_ATOMIC) {
>> +		error = -EAGAIN;
>> +		/*
>> +		 * If we allocate less than what is required for the write
>> +		 * then we may end up with multiple mappings, which means that
>> +		 * REQ_ATOMIC-based cannot be used, so avoid this possibility.
>> +		 */
>> +		if (needs_alloc && orig_end_fsb - offset_fsb > 1)
>> +			goto out_unlock;
> 
> I have a quick question here. Based on above check it looks like
> allocation requests on a hole or the 1st time allocation (append writes)
> for a given logical range will always be done using CoW fallback
> mechanism, isn't it? 

Right, but...


> So that means HW based multi-fsblock atomic write
> request will only happen for over writes (non-discontigous extent),
> correct?

For an unwritten pre-allocated extent, we can use the REQ_ATOMIC method.

fallocate (without ZERO RANGE) would give a pre-allocated unwritten 
extent, and a write there would not technically be an overwrite.
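
To make that concrete from userspace, a minimal sketch (the RWF_ATOMIC
fallback define is taken from the Linux uapi; the path, sizes and alignment
here are illustrative and need to match the advertised atomic write limits
of the file):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC	0x00000040	/* from the Linux uapi; newer headers define it */
#endif

int
main(void)
{
	struct iovec	iov;
	void		*buf;
	int		fd;

	fd = open("/mnt/scratch/file", O_CREAT | O_RDWR | O_DIRECT, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Preallocate an unwritten extent; nothing has been written yet. */
	if (fallocate(fd, 0, 0, 1 << 20) < 0) {
		perror("fallocate");
		return 1;
	}

	if (posix_memalign(&buf, 4096, 16384))
		return 1;
	memset(buf, 0xab, 16384);
	iov = (struct iovec){ .iov_base = buf, .iov_len = 16384 };

	/*
	 * Not technically an overwrite, since the extent is unwritten, but
	 * the REQ_ATOMIC path can still be used as long as the resulting
	 * mapping is naturally aligned and contiguous.
	 */
	if (pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) != 16384) {
		perror("pwritev2(RWF_ATOMIC)");
		return 1;
	}
	return 0;
}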

> 
> Now, it's not always necessary that if we try to allocate an extent for
> the given range, it results into discontiguous extents. e.g. say, if the
> entire range being written to is a hole or append writes, then it might
> just allocate a single unwritten extent which is valid for doing an
> atomic write using HW/BIOs right?

Right

> And it is valid to write using unwritten extent as long as we don't have
> mixed mappings i.e. the entire range should either be unwritten or
> written for the atomic write to be untorned, correct?
> 

We can't write to discontiguous extents, and a mixed mapping would mean 
discontiguous extents.

And, as mentioned earlier, it is ok to use REQ_ATOMIC method on an 
unwritten extent.

> I am guessing this is kept intentional?
> 
Yes

Thanks,
John
John Garry March 17, 2025, 10:18 a.m. UTC | #4
On 17/03/2025 07:26, Christoph Hellwig wrote:
>> +static bool
>> +xfs_bmap_valid_for_atomic_write(
> 
> This is misnamed.  It checks if the hardware offload an be used.

ok, so maybe:

xfs_bmap_atomic_write_hw_possible()?


> 
>> +	/* Misaligned start block wrt size */
>> +	if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount))
>> +		return false;
> 
> It is very obvious that this checks for a natural alignment of the
> block number.  But it fails to explain why it checks for that.

Fine, so it will be something like "atomic writes are required to be 
naturally aligned for disk blocks, which is a block layer rule to ensure 
that we won't straddle any boundary or violate write alignment requirement".

> 
>> +
>> +	/* Discontiguous extents */
>> +	if (!imap_spans_range(imap, offset_fsb, end_fsb))
> 
> Same here.
> 
>> +		if (shared) {
>> +			/*
>> +			 * Since we got a CoW fork extent mapping, ensure that
>> +			 * the mapping is actually suitable for an
>> +			 * REQ_ATOMIC-based atomic write, i.e. properly aligned
>> +			 * and covers the full range of the write. Otherwise,
>> +			 * we need to use the COW-based atomic write mode.
>> +			 */
>> +			if ((flags & IOMAP_ATOMIC) &&
>> +			    !xfs_bmap_valid_for_atomic_write(&cmap,
> 
> The "Since.." implies there is something special about COW fork mappings.
> But I don't think there is, or am I missing something?

nothing special

> If xfs_bmap_valid_for_atomic_write was properly named and documented
> this comment should probably just go away.

sure

> 
>> +static int
>> +xfs_atomic_write_cow_iomap_begin(
> 
> Should the atomic and cow be together for coherent naming?
> But even if the naming is coherent it isn't really
> self-explanatory, so please add a little top of the function
> comment introducing it.

I can add a comment, but please let me know of any name suggestion

> 
>> +	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
>> +			&nimaps, 0);
>> +	if (error)
>> +		goto out_unlock;
> 
> Why does this need to read the existing data for mapping?  You'll
> overwrite everything through the COW fork anyway.
> 

We next call xfs_reflink_allocate_cow(), which uses the imap as the 
basis to carry the offset and count.

Are you hinting at statically declaring imap, like:

struct xfs_bmbt_irec imap = {
	.br_startoff		= offset_fsb,
	.br_startblock		= HOLESTARTBLOCK, //?
	.br_blockcount		= end_fsb - offset_fsb,
	.br_state		= XFS_EXT_UNWRITTEN,
};

Note I am not sure what problems could be encountered with such an
approach.
Ritesh Harjani (IBM) March 17, 2025, 2:20 p.m. UTC | #5
John Garry <john.g.garry@oracle.com> writes:

>>> +		}
>>>   		end_fsb = imap.br_startoff + imap.br_blockcount;
>>>   		length = XFS_FSB_TO_B(mp, end_fsb) - offset;
>>>   	}
>>>   
>>> -	if (imap_needs_alloc(inode, flags, &imap, nimaps))
>>> +	needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps);
>>> +
>>> +	if (flags & IOMAP_ATOMIC) {
>>> +		error = -EAGAIN;
>>> +		/*
>>> +		 * If we allocate less than what is required for the write
>>> +		 * then we may end up with multiple mappings, which means that
>>> +		 * REQ_ATOMIC-based cannot be used, so avoid this possibility.
>>> +		 */
>>> +		if (needs_alloc && orig_end_fsb - offset_fsb > 1)
>>> +			goto out_unlock;
>> 
>> I have a quick question here. Based on above check it looks like
>> allocation requests on a hole or the 1st time allocation (append writes)
>> for a given logical range will always be done using CoW fallback
>> mechanism, isn't it? 
>
> Right, but...
>
>
>> So that means HW based multi-fsblock atomic write
>> request will only happen for over writes (non-discontigous extent),
>> correct?
>
> For an unwritten pre-allocated extent, we can use the REQ_ATOMIC method.
>
> fallocate (without ZERO RANGE) would give a pre-allocated unwritten 
> extent, and a write there would not technically be an overwrite.
>
>> 
>> Now, it's not always necessary that if we try to allocate an extent for
>> the given range, it results into discontiguous extents. e.g. say, if the
>> entire range being written to is a hole or append writes, then it might
>> just allocate a single unwritten extent which is valid for doing an
>> atomic write using HW/BIOs right?
>
> Right
>
>> And it is valid to write using unwritten extent as long as we don't have
>> mixed mappings i.e. the entire range should either be unwritten or
>> written for the atomic write to be untorned, correct?
>> 
>
> We can't write to discontiguous extents, and a mixed mapping would mean 
> discontiguous extents.
>
> And, as mentioned earlier, it is ok to use REQ_ATOMIC method on an 
> unwritten extent.
>
>> I am guessing this is kept intentional?
>> 
> Yes

Thanks, John, for addressing the queries. It would be helpful to include
this information in the commit message as well then, right? Otherwise,
IMO, the original commit message looks incomplete.

Maybe we can add this too?
=========================
This patch adds CoW-based atomic write support, which will be used as a
SW fallback in the following scenarios:

- All append write scenarios.
- Any new writes to a region containing holes.
- Writes to any misaligned regions.
- Writes to discontiguous extents.


<original commit msg snip>
=========================
In cases of an atomic write covering misaligned or discontiguous disk
blocks, we will use a CoW-based method to issue the atomic write.

-ritesh
John Garry March 17, 2025, 2:56 p.m. UTC | #6
On 17/03/2025 14:20, Ritesh Harjani (IBM) wrote:
>> And, as mentioned earlier, it is ok to use REQ_ATOMIC method on an
>> unwritten extent.
>>
>>> I am guessing this is kept intentional?
>>>
>> Yes
> Thanks, John for addressing the queries. It would be helpful to include
> this information in the commit message as well then right? Otherwise
> IMO, the original commit message looks incomplete.

ok, fine. I am just worried that these commit messages become too wordy. 
But, if people want this info, then I can provide it.

> 
> Maybe we can add this too?
> =========================
> This patch adds CoW based atomic write support which will be used as a
> SW fallback in following scenarios:
> 
> - All append write scenarios.
> - Any new writes on the region containing holes.

but only for > 1x block, since we must be able to alloc at least a
single block :)

> - Writes to any misaligned regions
> - Writes to discontiguous extents.

ok, fine

> 
> 
> <original commit msg snip>
> =========================
> In cases of an atomic write covering misaligned or discontiguous disk
> blocks, we will use a CoW-based method to issue the atomic write.

sure

Thanks,
John
hch March 18, 2025, 5:35 a.m. UTC | #7
On Mon, Mar 17, 2025 at 02:56:52PM +0000, John Garry wrote:
> ok, fine. I am just worried that these commit messages become too wordy. 
> But, if people want this info, then I can provide it.

While too wordy commit messages do exist, you're really far from that
threshold.  So yes, please explain this.

I think we also really need a document outside of commit logs and
comments that explains the exact atomic write semantics and how
we implement them between the hardware offload and always COW writes.
hch March 18, 2025, 5:39 a.m. UTC | #8
On Mon, Mar 17, 2025 at 10:18:58AM +0000, John Garry wrote:
> On 17/03/2025 07:26, Christoph Hellwig wrote:
>>> +static bool
>>> +xfs_bmap_valid_for_atomic_write(
>>
>> This is misnamed.  It checks if the hardware offload an be used.
>
> ok, so maybe:
>
> xfs_bmap_atomic_write_hw_possible()?

That does sound better.

> Fine, so it will be something like "atomic writes are required to be 
> naturally aligned for disk blocks, which is a block layer rule to ensure 
> that we won't straddle any boundary or violate write alignment 
> requirement".

Much better!  Maybe spell out the actual block layer rule, though?

>>
>> Should the atomic and cow be together for coherent naming?
>> But even if the naming is coherent it isn't really
>> self-explanatory, so please add a little top of the function
>> comment introducing it.
>
> I can add a comment, but please let me know of any name suggestion

/*
 * Handler for atomic writes implemented by writing out of place through
 * the COW fork.  If possible we try to use hardware provided atomicity
 * instead, which is handled directly in xfs_direct_write_iomap_begin.
 */

>
>>
>>> +	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
>>> +			&nimaps, 0);
>>> +	if (error)
>>> +		goto out_unlock;
>>
>> Why does this need to read the existing data for mapping?  You'll
>> overwrite everything through the COW fork anyway.
>>
>
> We next call xfs_reflink_allocate_cow(), which uses the imap as the basis 
> to carry the offset and count.

Is xfs_reflink_allocate_cow even the right helper to use?  We know we
absolutely want a COW fork extent, we know there can't be a delalloc
extent to convert as we flushed dirty data, so most of the logic in it
is pointless.
John Garry March 18, 2025, 8:22 a.m. UTC | #9
On 18/03/2025 05:39, Christoph Hellwig wrote:
> On Mon, Mar 17, 2025 at 10:18:58AM +0000, John Garry wrote:
>> On 17/03/2025 07:26, Christoph Hellwig wrote:
>>>> +static bool
>>>> +xfs_bmap_valid_for_atomic_write(
>>>
>>> This is misnamed.  It checks if the hardware offload an be used.
>>
>> ok, so maybe:
>>
>> xfs_bmap_atomic_write_hw_possible()?
> 
> That does sound better.
> 
>> Fine, so it will be something like "atomic writes are required to be
>> naturally aligned for disk blocks, which is a block layer rule to ensure
>> that we won't straddle any boundary or violate write alignment
>> requirement".
> 
> Much better! 

good

> Maybe spell out the actual block layer rule, though?

ok, fine, so that will be something like "writes need to be naturally 
aligned to ensure that we don't straddle any bdev atomic boundary".
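
As a worked example of that rule (illustrative block numbers only,
mirroring the IS_ALIGNED() check in the patch):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * A write of @len blocks starting at block @start is naturally aligned
 * when @start is a multiple of @len (with @len a power of two, as atomic
 * write units are).  Such a write sits entirely inside one @len-sized
 * window and therefore cannot straddle any boundary that is a multiple
 * of @len, e.g. a bdev atomic write boundary.
 */
static bool
naturally_aligned(uint64_t start, uint64_t len)
{
	return (start & (len - 1)) == 0;
}

int
main(void)
{
	/* 16-block write at block 4096 stays within [4096, 4111]: OK. */
	assert(naturally_aligned(4096, 16));

	/*
	 * 16-block write at block 4100 spans [4100, 4115] and crosses the
	 * boundary at 4112, so the REQ_ATOMIC method cannot be used.
	 */
	assert(!naturally_aligned(4100, 16));
	return 0;
}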

> 
>>>
>>> Should the atomic and cow be together for coherent naming?
>>> But even if the naming is coherent it isn't really
>>> self-explanatory, so please add a little top of the function
>>> comment introducing it.
>>
>> I can add a comment, but please let me know of any name suggestion
> 
> /*
>   * Handler for atomic writes implemented by writing out of place through
>   * the COW fork.  If possible we try to use hardware provided atomicy
>   * instead, which is handled directly in xfs_direct_write_iomap_begin.
>   */

ok, fine

> 
>>
>>>
>>>> +	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
>>>> +			&nimaps, 0);
>>>> +	if (error)
>>>> +		goto out_unlock;
>>>
>>> Why does this need to read the existing data for mapping?  You'll
>>> overwrite everything through the COW fork anyway.
>>>
>>
>> We next call xfs_reflink_allocate_cow(), which uses the imap as the basis
>> to carry the offset and count.
> 
> Is xfs_reflink_allocate_cow even the right helper to use?  We know we
> absolutely want a a COW fork extent, we know there can't be delalloc
> extent to convert as we flushed dirty data, so most of the logic in it
> is pointless.

Well xfs_reflink_allocate_cow gives us what we want when we set some 
flag (XFS_REFLINK_FORCE_COW).

Are you hinting at a dedicated helper? Note that 
xfs_reflink_fill_cow_hole() also handles the XFS_REFLINK_FORCE_COW flag.

>
hch March 18, 2025, 8:32 a.m. UTC | #10
On Tue, Mar 18, 2025 at 08:22:53AM +0000, John Garry wrote:
>> Is xfs_reflink_allocate_cow even the right helper to use?  We know we
>> absolutely want a a COW fork extent, we know there can't be delalloc
>> extent to convert as we flushed dirty data, so most of the logic in it
>> is pointless.
>
> Well xfs_reflink_allocate_cow gives us what we want when we set some flag 
> (XFS_REFLINK_FORCE_COW).
>
> Are you hinting at a dedicated helper? Note that 
> xfs_reflink_fill_cow_hole() also handles the XFS_REFLINK_FORCE_COW flag.

We might not even need much of a helper.  This all comes from my
experience with the zoned code that always writes out of place as well.
I initially tried to reuse the existing iomap_begin methods and all
these helpers, but in the end it turned out to be much, much simpler to
just open code the logic.  Now the atomic cow code will be a little
more complex in some aspects (unwritten extents, speculative
preallocations), but also simpler in others (no need to support buffered
I/O including zeroing and sub-block writes that require the ugly
srcmap based read-modify-write), but I suspect the overall trade-off
will be similar.

After all the high-level logic for the atomic COW iomap_begin really
should just be:

 - take the ilock
 - check the COW fork if there is an extent for the start of the range.
 - If yes:
     - if the extent is unwritten, convert the part overlapping with
       the range to written
     - return the extent
 - If no:
     - allocate a new extent covering the range in the COW fork

Doing this without going down multiple levels of helpers designed for
an entirely different use case will probably simplify things
significantly.
John Garry March 18, 2025, 5:44 p.m. UTC | #11
On 18/03/2025 08:32, Christoph Hellwig wrote:
> On Tue, Mar 18, 2025 at 08:22:53AM +0000, John Garry wrote:
>>> Is xfs_reflink_allocate_cow even the right helper to use?  We know we
>>> absolutely want a a COW fork extent, we know there can't be delalloc
>>> extent to convert as we flushed dirty data, so most of the logic in it
>>> is pointless.
>>
>> Well xfs_reflink_allocate_cow gives us what we want when we set some flag
>> (XFS_REFLINK_FORCE_COW).
>>
>> Are you hinting at a dedicated helper? Note that
>> xfs_reflink_fill_cow_hole() also handles the XFS_REFLINK_FORCE_COW flag.
> 
> We might not even need much of a helper.  This all comes from my
> experience with the zoned code that always writes out of place as well.
> I initially tried to reuse the existing iomap_begin methods and all
> these helpers, but in the end it turned out to much, much simpler to
> just open code the logic.  Now the atomic cow code will be a little
> more complex in some aspect (unwritten extents, speculative
> preallocations), but also simpler in others (no need to support buffered
> I/O including zeroing and sub-block writes that require the ugly
> srcmap based read-modify-write), but I suspect the overall trade-off
> will be similar.
> 
> After all the high-level logic for the atomic COW iomap_begin really
> should just be:
> 
>   - take the ilock
>   - check the COW fork if there is an extent for the start of the range.
>   - If yes:
>       - if the extent is unwritten, convert the part overlapping with
>         the range to written

I was wondering when it could be written, and then I figured it could be 
if we had this sort of scenario:
- initially we have a mix of shared and unshared extents in the file, like:

mnt/file:
EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL FLAGS
   0: [0..7]:          192..199          0 (192..199)           8 100000
   1: [8..15]:         32840..32847      0 (32840..32847)       8 000000
   2: [16..20479]:     208..20671        0 (208..20671)     20464 100000
FLAG Values:
    0100000 Shared extent
    0010000 Unwritten preallocated extent
    0001000 Doesn't begin on stripe unit
    0000100 Doesn't end   on stripe unit
    0000010 Doesn't begin on stripe width
    0000001 Doesn't end   on stripe width

- try atomic write of size 32K @ offset 0
	- we first try HW atomics method and call
	  xfs_direct_write_iomap_begin() -> xfs_reflink_allocate_cow()
	- CoW fork mapping is no good for atomics (too short), so try CoW
	  atomic method
- in CoW atomic method, we find that pre-alloc'ed CoW fork extent
  (which was converted to written already)

>       - return the extent
>   - If no:
>       - allocate a new extent covering the range in the COW fork
> 
> Doing this without going down multiple levels of helpers designed for
> an entirely different use case will probably simplify things
> significantly.

Please suggest any further modifications to the following attempt. I 
have XFS_REFLINK_FORCE_COW still being passed to 
xfs_reflink_fill_cow_hole(), but xfs_reflink_fill_cow_hole() is quite a 
large function and I am not sure if I want to duplicate lots of it.

static int
xfs_atomic_write_cow_iomap_begin(
	struct inode		*inode,
	loff_t			offset,
	loff_t			length,
	unsigned		flags,
	struct iomap		*iomap,
	struct iomap		*srcmap)
{
	ASSERT(flags & IOMAP_WRITE);
	ASSERT(flags & IOMAP_DIRECT);

	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
	xfs_fileoff_t		end_fsb = xfs_iomap_end_fsb(mp, offset, length);
	xfs_fileoff_t		count_fsb = end_fsb - offset_fsb;
	struct xfs_bmbt_irec	imap = {
		.br_startoff = offset_fsb,
		.br_startblock = HOLESTARTBLOCK,
		.br_blockcount = end_fsb - offset_fsb,
		.br_state = XFS_EXT_UNWRITTEN,
	};
	struct xfs_bmbt_irec	cmap;
	bool			shared = false;
	unsigned int		lockmode = XFS_ILOCK_EXCL;
	int			error;
	u64			seq;
	struct xfs_iext_cursor	icur;

	if (xfs_is_shutdown(mp))
		return -EIO;

	if (!xfs_has_reflink(mp))
		return -EINVAL;

	if (!ip->i_cowfp) {
		ASSERT(!xfs_is_reflink_inode(ip));
		xfs_ifork_init_cow(ip);
	}

	error = xfs_ilock_for_iomap(ip, flags, &lockmode);
	if (error)
		return error;

	/* Reuse a COW fork extent if one already covers the start of the range. */
	if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap) &&
	    cmap.br_startoff <= offset_fsb) {
		if (cmap.br_state == XFS_EXT_UNWRITTEN) {
			xfs_trim_extent(&cmap, offset_fsb, count_fsb);

			error = xfs_reflink_convert_unwritten(ip, &imap, &cmap,
					XFS_REFLINK_CONVERT_UNWRITTEN);
			if (error)
				goto out_unlock;
		}
	} else {
		/* Otherwise allocate a new COW fork extent for the range. */
		error = xfs_reflink_fill_cow_hole(ip, &imap, &cmap, &shared,
				&lockmode, XFS_REFLINK_CONVERT_UNWRITTEN |
				XFS_REFLINK_FORCE_COW | XFS_REFLINK_ALLOC_EXTSZALIGN);
		if (error)
			goto out_unlock;
	}

finish:
	... // as before
}

xfs_reflink_convert_unwritten() and others are private to reflink.c, so
this function should probably also live there.

thanks
hch March 19, 2025, 7:30 a.m. UTC | #12
On Tue, Mar 18, 2025 at 05:44:46PM +0000, John Garry wrote:
> Please suggest any further modifications to the following attempt. I have 
> XFS_REFLINK_FORCE_COW still being passed to xfs_reflink_fill_cow_hole(), 
> but xfs_reflink_fill_cow_hole() is quite a large function and I am not sure 
> if I want to duplicate lots of it.

As said I'd do away with the helpers.  Below is my completely
untested whiteboard coding attempt, based against the series you
sent out.

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 88d86cabb8a1..06ece7070cfd 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1083,67 +1083,104 @@ xfs_atomic_write_cow_iomap_begin(
 	struct iomap		*iomap,
 	struct iomap		*srcmap)
 {
-	ASSERT(flags & IOMAP_WRITE);
-	ASSERT(flags & IOMAP_DIRECT);
-
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_bmbt_irec	imap, cmap;
 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	xfs_fileoff_t		end_fsb = xfs_iomap_end_fsb(mp, offset, length);
-	int			nimaps = 1, error;
-	bool			shared = false;
-	unsigned int		lockmode = XFS_ILOCK_EXCL;
+	xfs_filblks_t		count_fsb = end_fsb - offset_fsb;
+	int			nmaps = 1;
+	xfs_filblks_t		resaligned;
+	struct xfs_bmbt_irec	cmap;
+	struct xfs_iext_cursor	icur;
+	struct xfs_trans	*tp;
+	int			error;
 	u64			seq;
 
+	ASSERT(!XFS_IS_REALTIME_INODE(ip));
+	ASSERT(flags & IOMAP_WRITE);
+	ASSERT(flags & IOMAP_DIRECT);
+
 	if (xfs_is_shutdown(mp))
 		return -EIO;
 
-	if (!xfs_has_reflink(mp))
+	if (WARN_ON_ONCE(!xfs_has_reflink(mp)))
 		return -EINVAL;
 
-	error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	if (!ip->i_cowfp) {
+		ASSERT(!xfs_is_reflink_inode(ip));
+		xfs_ifork_init_cow(ip);
+	}
+
+	/*
+	 * If we don't find an overlapping extent, trim the range we need to
+	 * allocate to fit the hole we found.
+	 */
+	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+		cmap.br_startoff = end_fsb;
+	if (cmap.br_startoff <= offset_fsb) {
+		xfs_trim_extent(&cmap, offset_fsb, count_fsb);
+		goto found;
+	}
+
+	end_fsb = cmap.br_startoff;
+	count_fsb = end_fsb - offset_fsb;
+	resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
+			xfs_get_cowextsz_hint(ip));
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
+			XFS_DIOSTRAT_SPACE_RES(mp, resaligned), 0, false, &tp);
 	if (error)
 		return error;
 
-	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
-			&nimaps, 0);
-	if (error)
-		goto out_unlock;
+	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+		cmap.br_startoff = end_fsb;
+	if (cmap.br_startoff <= offset_fsb) {
+		xfs_trim_extent(&cmap, offset_fsb, count_fsb);
+		xfs_trans_cancel(tp);
+		goto found;
+	}
 
-	 /*
-	  * Use XFS_REFLINK_ALLOC_EXTSZALIGN to hint at aligning new extents
-	  * according to extszhint, such that there will be a greater chance
-	  * that future atomic writes to that same range will be aligned (and
-	  * don't require this COW-based method).
-	  */
-	error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
-			&lockmode, XFS_REFLINK_CONVERT_UNWRITTEN |
-			XFS_REFLINK_FORCE_COW | XFS_REFLINK_ALLOC_EXTSZALIGN);
 	/*
-	 * Don't check @shared. For atomic writes, we should error when
-	 * we don't get a COW fork extent mapping.
+	 * Allocate the entire reservation as unwritten blocks.
+	 *
+	 * Use XFS_BMAPI_EXTSZALIGN to hint at aligning new extents according to
+	 * extszhint, such that there will be a greater chance that future
+	 * atomic writes to that same range will be aligned (and don't require
+	 * this COW-based method).
 	 */
-	if (error)
+	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
+			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC |
+			XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps);
+	if (error) {
+		xfs_trans_cancel(tp);
 		goto out_unlock;
+	}
 
-	end_fsb = imap.br_startoff + imap.br_blockcount;
+	xfs_inode_set_cowblocks_tag(ip);
+	error = xfs_trans_commit(tp);
+	if (error)
+		goto out_unlock;
 
-	length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
-	trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
-	if (imap.br_startblock != HOLESTARTBLOCK) {
-		seq = xfs_iomap_inode_sequence(ip, 0);
-		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
+found:
+	if (cmap.br_state != XFS_EXT_NORM) {
+		error = xfs_reflink_convert_cow_locked(ip, offset_fsb,
+				count_fsb);
 		if (error)
 			goto out_unlock;
+		cmap.br_state = XFS_EXT_NORM;
 	}
+
+	length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
+	trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
 	seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
-	xfs_iunlock(ip, lockmode);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
 
 out_unlock:
-	if (lockmode)
-		xfs_iunlock(ip, lockmode);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index b983f5413be6..71116e6a692c 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -293,7 +293,7 @@ xfs_bmap_trim_cow(
 	return xfs_reflink_trim_around_shared(ip, imap, shared);
 }
 
-static int
+int
 xfs_reflink_convert_cow_locked(
 	struct xfs_inode	*ip,
 	xfs_fileoff_t		offset_fsb,
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 969006661a3f..ab3fa3c95196 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -45,6 +45,8 @@ int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
 		unsigned int flags);
 extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
 		xfs_off_t count);
+int xfs_reflink_convert_cow_locked(struct xfs_inode *ip,
+		xfs_fileoff_t offset_fsb, xfs_filblks_t count_fsb);
 
 extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
 		struct xfs_trans **tpp, xfs_fileoff_t offset_fsb,
John Garry March 19, 2025, 10:24 a.m. UTC | #13
On 19/03/2025 07:30, Christoph Hellwig wrote:
> On Tue, Mar 18, 2025 at 05:44:46PM +0000, John Garry wrote:
>> Please suggest any further modifications to the following attempt. I have
>> XFS_REFLINK_FORCE_COW still being passed to xfs_reflink_fill_cow_hole(),
>> but xfs_reflink_fill_cow_hole() is quite a large function and I am not sure
>> if I want to duplicate lots of it.
> 
> As said I'd do away with the helpers.  Below is my completely
> untested whiteboard coding attempt, based against the series you
> sent out.

it seems to work ok, cheers

Just a query, below

> 
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 88d86cabb8a1..06ece7070cfd 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -1083,67 +1083,104 @@ xfs_atomic_write_cow_iomap_begin(
>   	struct iomap		*iomap,
>   	struct iomap		*srcmap)
>   {
> -	ASSERT(flags & IOMAP_WRITE);
> -	ASSERT(flags & IOMAP_DIRECT);
> -
>   	struct xfs_inode	*ip = XFS_I(inode);
>   	struct xfs_mount	*mp = ip->i_mount;
> -	struct xfs_bmbt_irec	imap, cmap;
>   	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
>   	xfs_fileoff_t		end_fsb = xfs_iomap_end_fsb(mp, offset, length);
> -	int			nimaps = 1, error;
> -	bool			shared = false;
> -	unsigned int		lockmode = XFS_ILOCK_EXCL;
> +	xfs_filblks_t		count_fsb = end_fsb - offset_fsb;
> +	int			nmaps = 1;
> +	xfs_filblks_t		resaligned;
> +	struct xfs_bmbt_irec	cmap;
> +	struct xfs_iext_cursor	icur;
> +	struct xfs_trans	*tp;
> +	int			error;
>   	u64			seq;
>   
> +	ASSERT(!XFS_IS_REALTIME_INODE(ip));
> +	ASSERT(flags & IOMAP_WRITE);
> +	ASSERT(flags & IOMAP_DIRECT);
> +
>   	if (xfs_is_shutdown(mp))
>   		return -EIO;
>   
> -	if (!xfs_has_reflink(mp))
> +	if (WARN_ON_ONCE(!xfs_has_reflink(mp)))
>   		return -EINVAL;
>   
> -	error = xfs_ilock_for_iomap(ip, flags, &lockmode);
> +	xfs_ilock(ip, XFS_ILOCK_EXCL);
> +
> +	if (!ip->i_cowfp) {
> +		ASSERT(!xfs_is_reflink_inode(ip));
> +		xfs_ifork_init_cow(ip);
> +	}
> +
> +	/*
> +	 * If we don't find an overlapping extent, trim the range we need to
> +	 * allocate to fit the hole we found.
> +	 */
> +	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
> +		cmap.br_startoff = end_fsb;
> +	if (cmap.br_startoff <= offset_fsb) {
> +		xfs_trim_extent(&cmap, offset_fsb, count_fsb);
> +		goto found;
> +	}
> +
> +	end_fsb = cmap.br_startoff;
> +	count_fsb = end_fsb - offset_fsb;
> +	resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
> +			xfs_get_cowextsz_hint(ip));
> +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +
> +	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
> +			XFS_DIOSTRAT_SPACE_RES(mp, resaligned), 0, false, &tp);
>   	if (error)
>   		return error;
>   
> -	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
> -			&nimaps, 0);
> -	if (error)
> -		goto out_unlock;
> +	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
> +		cmap.br_startoff = end_fsb;

Do we really need this logic?

offset_fsb does not change, and logically cmap.br_startoff == end_fsb 
already, right?

> +	if (cmap.br_startoff <= offset_fsb) {
> +		xfs_trim_extent(&cmap, offset_fsb, count_fsb);
> +		xfs_trans_cancel(tp);
> +		goto found;
> +	}
>   
> -	 /*
> -	  * Use XFS_REFLINK_ALLOC_EXTSZALIGN to hint at aligning new extents
> -	  * according to extszhint, such that there will be a greater chance
> -	  * that future atomic writes to that same range will be aligned (and
> -	  * don't require this COW-based method).
> -	  */
> -	error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
> -			&lockmode, XFS_REFLINK_CONVERT_UNWRITTEN |
> -			XFS_REFLINK_FORCE_COW | XFS_REFLINK_ALLOC_EXTSZALIGN);
>   	/*
> -	 * Don't check @shared. For atomic writes, we should error when
> -	 * we don't get a COW fork extent mapping.
> +	 * Allocate the entire reservation as unwritten blocks.
> +	 *
> +	 * Use XFS_BMAPI_EXTSZALIGN to hint at aligning new extents according to
> +	 * extszhint, such that there will be a greater chance that future
> +	 * atomic writes to that same range will be aligned (and don't require
> +	 * this COW-based method).
>   	 */
> -	if (error)
> +	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
> +			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC |
> +			XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps);
hch March 20, 2025, 5:29 a.m. UTC | #14
On Wed, Mar 19, 2025 at 10:24:55AM +0000, John Garry wrote:
> it seems to work ok, cheers

Better test it very well, this was really just intended as a sketch..

>> +	count_fsb = end_fsb - offset_fsb;
>> +	resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
>> +			xfs_get_cowextsz_hint(ip));
>> +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
>> +
>> +	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
>> +			XFS_DIOSTRAT_SPACE_RES(mp, resaligned), 0, false, &tp);
>>   	if (error)
>>   		return error;
>>   -	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
>> -			&nimaps, 0);
>> -	if (error)
>> -		goto out_unlock;
>> +	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
>> +		cmap.br_startoff = end_fsb;
>
> Do we really need this logic?
>
> offset_fsb does not change, and logically cmap.br_startoff == end_fsb 
> already, right?

After unlocking and relocking the ilock, the extent layout could have
changed.
John Garry March 20, 2025, 9:49 a.m. UTC | #15
On 20/03/2025 05:29, Christoph Hellwig wrote:
> On Wed, Mar 19, 2025 at 10:24:55AM +0000, John Garry wrote:
>> it seems to work ok, cheers
> 
> Better test it very well, this was really just intended as a sketch..

Sure, I have been testing a lot so far.

I had been using fio in verify mode as a method to check racing threads 
reading and atomically writing the same file range, so I need to ensure 
that it covers the various paths in this function.
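
The essence of that verification is sketched below as a standalone program
(this is not the fio job itself; the RWF_ATOMIC fallback define, block size
and the need to build with -pthread are assumptions here):

#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC	0x00000040	/* from the Linux uapi; newer headers define it */
#endif

#define BLKSZ	16384			/* atomic write unit under test (illustrative) */
#define ROUNDS	100000

static int fd;

static void *
writer(void *arg)
{
	uint64_t	*buf;
	struct iovec	iov;

	(void)arg;
	if (posix_memalign((void **)&buf, 4096, BLKSZ))
		return NULL;
	for (uint64_t seq = 1; seq <= ROUNDS; seq++) {
		/* The whole block carries a single sequence number. */
		for (size_t i = 0; i < BLKSZ / sizeof(*buf); i++)
			buf[i] = seq;
		iov = (struct iovec){ .iov_base = buf, .iov_len = BLKSZ };
		if (pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) != BLKSZ) {
			perror("pwritev2(RWF_ATOMIC)");
			break;
		}
	}
	free(buf);
	return NULL;
}

static void *
reader(void *arg)
{
	uint64_t	*buf;

	(void)arg;
	if (posix_memalign((void **)&buf, 4096, BLKSZ))
		return NULL;
	for (int r = 0; r < ROUNDS; r++) {
		if (pread(fd, buf, BLKSZ, 0) != BLKSZ)
			continue;
		/* A mix of old and new patterns would mean a torn write. */
		for (size_t i = 1; i < BLKSZ / sizeof(*buf); i++) {
			if (buf[i] != buf[0]) {
				fprintf(stderr, "torn write detected\n");
				abort();
			}
		}
	}
	free(buf);
	return NULL;
}

int
main(int argc, char **argv)
{
	pthread_t	w, r;

	if (argc != 2)
		return 1;
	/* The file is assumed to already exist and be >= BLKSZ in size. */
	fd = open(argv[1], O_RDWR | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	pthread_create(&w, NULL, writer, NULL);
	pthread_create(&r, NULL, reader, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return 0;
}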

> 
>>> +	count_fsb = end_fsb - offset_fsb;
>>> +	resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
>>> +			xfs_get_cowextsz_hint(ip));
>>> +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
>>> +
>>> +	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
>>> +			XFS_DIOSTRAT_SPACE_RES(mp, resaligned), 0, false, &tp);
>>>    	if (error)
>>>    		return error;
>>>    -	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
>>> -			&nimaps, 0);
>>> -	if (error)
>>> -		goto out_unlock;
>>> +	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
>>> +		cmap.br_startoff = end_fsb;
>>
>> Do we really need this logic?
>>
>> offset_fsb does not change, and logically cmap.br_startoff == end_fsb
>> already, right?
> 
> Afte unlocking and relocking the ilock the extent layout could have
> changed.

ok, understood. Maybe a comment would help in understanding that.

>
hch March 20, 2025, 2:12 p.m. UTC | #16
On Thu, Mar 20, 2025 at 09:49:44AM +0000, John Garry wrote:
>>> offset_fsb does not change, and logically cmap.br_startoff == end_fsb
>>> already, right?
>>
>> Afte unlocking and relocking the ilock the extent layout could have
>> changed.
>
> ok, understood. Maybe a comment will help understanding that.

Sure.  As said this was just intended as draft code.  Maybe factoring
out a helper or two would also be useful, although I couldn't think
of a really obvious one.

Patch

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 8196e66b099b..88d86cabb8a1 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -798,6 +798,23 @@  imap_spans_range(
 	return true;
 }
 
+static bool
+xfs_bmap_valid_for_atomic_write(
+	struct xfs_bmbt_irec	*imap,
+	xfs_fileoff_t		offset_fsb,
+	xfs_fileoff_t		end_fsb)
+{
+	/* Misaligned start block wrt size */
+	if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount))
+		return false;
+
+	/* Discontiguous extents */
+	if (!imap_spans_range(imap, offset_fsb, end_fsb))
+		return false;
+
+	return true;
+}
+
 static int
 xfs_direct_write_iomap_begin(
 	struct inode		*inode,
@@ -812,10 +829,12 @@  xfs_direct_write_iomap_begin(
 	struct xfs_bmbt_irec	imap, cmap;
 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	xfs_fileoff_t		end_fsb = xfs_iomap_end_fsb(mp, offset, length);
+	xfs_fileoff_t		orig_end_fsb = end_fsb;
 	int			nimaps = 1, error = 0;
 	unsigned int		reflink_flags = 0;
 	bool			shared = false;
 	u16			iomap_flags = 0;
+	bool			needs_alloc;
 	unsigned int		lockmode;
 	u64			seq;
 
@@ -877,13 +896,44 @@  xfs_direct_write_iomap_begin(
 				&lockmode, reflink_flags);
 		if (error)
 			goto out_unlock;
-		if (shared)
+		if (shared) {
+			/*
+			 * Since we got a CoW fork extent mapping, ensure that
+			 * the mapping is actually suitable for an
+			 * REQ_ATOMIC-based atomic write, i.e. properly aligned
+			 * and covers the full range of the write. Otherwise,
+			 * we need to use the COW-based atomic write mode.
+			 */
+			if ((flags & IOMAP_ATOMIC) &&
+			    !xfs_bmap_valid_for_atomic_write(&cmap,
+					offset_fsb, end_fsb)) {
+				error = -EAGAIN;
+				goto out_unlock;
+			}
 			goto out_found_cow;
+		}
 		end_fsb = imap.br_startoff + imap.br_blockcount;
 		length = XFS_FSB_TO_B(mp, end_fsb) - offset;
 	}
 
-	if (imap_needs_alloc(inode, flags, &imap, nimaps))
+	needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps);
+
+	if (flags & IOMAP_ATOMIC) {
+		error = -EAGAIN;
+		/*
+		 * If we allocate less than what is required for the write
+		 * then we may end up with multiple mappings, which means that
+		 * REQ_ATOMIC-based cannot be used, so avoid this possibility.
+		 */
+		if (needs_alloc && orig_end_fsb - offset_fsb > 1)
+			goto out_unlock;
+
+		if (!xfs_bmap_valid_for_atomic_write(&imap, offset_fsb,
+				orig_end_fsb))
+			goto out_unlock;
+	}
+
+	if (needs_alloc)
 		goto allocate_blocks;
 
 	/*
@@ -1024,6 +1074,83 @@  const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
 };
 #endif /* CONFIG_XFS_RT */
 
+static int
+xfs_atomic_write_cow_iomap_begin(
+	struct inode		*inode,
+	loff_t			offset,
+	loff_t			length,
+	unsigned		flags,
+	struct iomap		*iomap,
+	struct iomap		*srcmap)
+{
+	ASSERT(flags & IOMAP_WRITE);
+	ASSERT(flags & IOMAP_DIRECT);
+
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_bmbt_irec	imap, cmap;
+	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	xfs_fileoff_t		end_fsb = xfs_iomap_end_fsb(mp, offset, length);
+	int			nimaps = 1, error;
+	bool			shared = false;
+	unsigned int		lockmode = XFS_ILOCK_EXCL;
+	u64			seq;
+
+	if (xfs_is_shutdown(mp))
+		return -EIO;
+
+	if (!xfs_has_reflink(mp))
+		return -EINVAL;
+
+	error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+	if (error)
+		return error;
+
+	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+			&nimaps, 0);
+	if (error)
+		goto out_unlock;
+
+	 /*
+	  * Use XFS_REFLINK_ALLOC_EXTSZALIGN to hint at aligning new extents
+	  * according to extszhint, such that there will be a greater chance
+	  * that future atomic writes to that same range will be aligned (and
+	  * don't require this COW-based method).
+	  */
+	error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
+			&lockmode, XFS_REFLINK_CONVERT_UNWRITTEN |
+			XFS_REFLINK_FORCE_COW | XFS_REFLINK_ALLOC_EXTSZALIGN);
+	/*
+	 * Don't check @shared. For atomic writes, we should error when
+	 * we don't get a COW fork extent mapping.
+	 */
+	if (error)
+		goto out_unlock;
+
+	end_fsb = imap.br_startoff + imap.br_blockcount;
+
+	length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
+	trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
+	if (imap.br_startblock != HOLESTARTBLOCK) {
+		seq = xfs_iomap_inode_sequence(ip, 0);
+		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
+		if (error)
+			goto out_unlock;
+	}
+	seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
+	xfs_iunlock(ip, lockmode);
+	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
+
+out_unlock:
+	if (lockmode)
+		xfs_iunlock(ip, lockmode);
+	return error;
+}
+
+const struct iomap_ops xfs_atomic_write_cow_iomap_ops = {
+	.iomap_begin		= xfs_atomic_write_cow_iomap_begin,
+};
+
 static int
 xfs_dax_write_iomap_end(
 	struct inode		*inode,
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index d330c4a581b1..674f8ac1b9bd 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -56,5 +56,6 @@  extern const struct iomap_ops xfs_read_iomap_ops;
 extern const struct iomap_ops xfs_seek_iomap_ops;
 extern const struct iomap_ops xfs_xattr_iomap_ops;
 extern const struct iomap_ops xfs_dax_write_iomap_ops;
+extern const struct iomap_ops xfs_atomic_write_cow_iomap_ops;
 
 #endif /* __XFS_IOMAP_H__*/