diff mbox series

[v2,09/11] xfs: Commit CoW-based atomic writes atomically

Message ID 20250213135619.1148432-10-john.g.garry@oracle.com (mailing list archive)
State New
Headers show
Series large atomic writes for xfs with CoW | expand

Commit Message

John Garry Feb. 13, 2025, 1:56 p.m. UTC
When completing a CoW-based write, each extent range mapping update is
covered by a separate transaction.

For a CoW-based atomic write, all mappings must be changed at once, so
change to use a single transaction.

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/xfs/xfs_file.c    |  5 ++++-
 fs/xfs/xfs_reflink.c | 45 ++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_reflink.h |  3 +++
 3 files changed, 52 insertions(+), 1 deletion(-)

Comments

Darrick J. Wong Feb. 24, 2025, 8:20 p.m. UTC | #1
On Thu, Feb 13, 2025 at 01:56:17PM +0000, John Garry wrote:
> When completing a CoW-based write, each extent range mapping update is
> covered by a separate transaction.
> 
> For a CoW-based atomic write, all mappings must be changed at once, so
> change to use a single transaction.
> 
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/xfs/xfs_file.c    |  5 ++++-
>  fs/xfs/xfs_reflink.c | 45 ++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_reflink.h |  3 +++
>  3 files changed, 52 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 9762fa503a41..243640fe4874 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -527,7 +527,10 @@ xfs_dio_write_end_io(
>  	nofs_flag = memalloc_nofs_save();
>  
>  	if (flags & IOMAP_DIO_COW) {
> -		error = xfs_reflink_end_cow(ip, offset, size);
> +		if (iocb->ki_flags & IOCB_ATOMIC)
> +			error = xfs_reflink_end_atomic_cow(ip, offset, size);
> +		else
> +			error = xfs_reflink_end_cow(ip, offset, size);
>  		if (error)
>  			goto out;
>  	}
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 3dab3ba900a3..d097d33dc000 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -986,6 +986,51 @@ xfs_reflink_end_cow(
>  		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
>  	return error;
>  }
> +int
> +xfs_reflink_end_atomic_cow(
> +	struct xfs_inode		*ip,
> +	xfs_off_t			offset,
> +	xfs_off_t			count)
> +{
> +	xfs_fileoff_t			offset_fsb;
> +	xfs_fileoff_t			end_fsb;
> +	int				error = 0;
> +	struct xfs_mount		*mp = ip->i_mount;
> +	struct xfs_trans		*tp;
> +	unsigned int			resblks;
> +
> +	trace_xfs_reflink_end_cow(ip, offset, count);
> +
> +	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
> +	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);

Use @mp here instead of walking the pointer.

> +
> +	resblks = (end_fsb - offset_fsb) *
> +			XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK);

How did you arrive at this computation?  The "b" parameter to
XFS_NEXTENTADD_SPACE_RES is usually the worst case number of mappings
that you're going to change on this file.  I think that quantity is
(end_fsb - offset_fsb)?

> +
> +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
> +			XFS_TRANS_RESERVE, &tp);
> +	if (error)
> +		return error;
> +
> +	xfs_ilock(ip, XFS_ILOCK_EXCL);
> +	xfs_trans_ijoin(tp, ip, 0);
> +
> +	while (end_fsb > offset_fsb && !error)
> +		error = xfs_reflink_end_cow_extent_locked(tp, ip, &offset_fsb,
> +							end_fsb);

Overly long line, and the continuation line only needs to be indented
two more tabs.

> +
> +	if (error) {
> +		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
> +		goto out_cancel;
> +	}
> +	error = xfs_trans_commit(tp);
> +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +	return 0;

Why is it ok to drop @error here?  Shouldn't a transaction commit error
should be reported to the writer thread?

--D

> +out_cancel:
> +	xfs_trans_cancel(tp);
> +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +	return error;
> +}
>  
>  /*
>   * Free all CoW staging blocks that are still referenced by the ondisk refcount
> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> index 754d2bb692d3..ca945d98e823 100644
> --- a/fs/xfs/xfs_reflink.h
> +++ b/fs/xfs/xfs_reflink.h
> @@ -43,6 +43,9 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
>  		xfs_off_t count, bool cancel_real);
>  extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
>  		xfs_off_t count);
> +		int
> +xfs_reflink_end_atomic_cow(struct xfs_inode *ip, xfs_off_t offset,
> +		xfs_off_t count);
>  extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
>  extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
>  		struct file *file_out, loff_t pos_out, loff_t len,
> -- 
> 2.31.1
> 
>
John Garry Feb. 25, 2025, 11:11 a.m. UTC | #2
On 24/02/2025 20:20, Darrick J. Wong wrote:
> On Thu, Feb 13, 2025 at 01:56:17PM +0000, John Garry wrote:
>> When completing a CoW-based write, each extent range mapping update is
>> covered by a separate transaction.
>>
>> For a CoW-based atomic write, all mappings must be changed at once, so
>> change to use a single transaction.
>>
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>>   fs/xfs/xfs_file.c    |  5 ++++-
>>   fs/xfs/xfs_reflink.c | 45 ++++++++++++++++++++++++++++++++++++++++++++
>>   fs/xfs/xfs_reflink.h |  3 +++
>>   3 files changed, 52 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
>> index 9762fa503a41..243640fe4874 100644
>> --- a/fs/xfs/xfs_file.c
>> +++ b/fs/xfs/xfs_file.c
>> @@ -527,7 +527,10 @@ xfs_dio_write_end_io(
>>   	nofs_flag = memalloc_nofs_save();
>>   
>>   	if (flags & IOMAP_DIO_COW) {
>> -		error = xfs_reflink_end_cow(ip, offset, size);
>> +		if (iocb->ki_flags & IOCB_ATOMIC)
>> +			error = xfs_reflink_end_atomic_cow(ip, offset, size);
>> +		else
>> +			error = xfs_reflink_end_cow(ip, offset, size);
>>   		if (error)
>>   			goto out;
>>   	}
>> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
>> index 3dab3ba900a3..d097d33dc000 100644
>> --- a/fs/xfs/xfs_reflink.c
>> +++ b/fs/xfs/xfs_reflink.c
>> @@ -986,6 +986,51 @@ xfs_reflink_end_cow(
>>   		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
>>   	return error;
>>   }
>> +int
>> +xfs_reflink_end_atomic_cow(
>> +	struct xfs_inode		*ip,
>> +	xfs_off_t			offset,
>> +	xfs_off_t			count)
>> +{
>> +	xfs_fileoff_t			offset_fsb;
>> +	xfs_fileoff_t			end_fsb;
>> +	int				error = 0;
>> +	struct xfs_mount		*mp = ip->i_mount;
>> +	struct xfs_trans		*tp;
>> +	unsigned int			resblks;
>> +
>> +	trace_xfs_reflink_end_cow(ip, offset, count);
>> +
>> +	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
>> +	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
> 
> Use @mp here instead of walking the pointer.

Yes

> 
>> +
>> +	resblks = (end_fsb - offset_fsb) *
>> +			XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK);
> 
> How did you arrive at this computation? 

hmmm... you suggested this, but maybe I picked it up incorrectly :)

> The "b" parameter to
> XFS_NEXTENTADD_SPACE_RES is usually the worst case number of mappings
> that you're going to change on this file.  I think that quantity is
> (end_fsb - offset_fsb)?

Can you please check this versus what you suggested in 
https://lore.kernel.org/linux-xfs/20250206215014.GX21808@frogsfrogsfrogs/#t

> 
>> +
>> +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
>> +			XFS_TRANS_RESERVE, &tp);
>> +	if (error)
>> +		return error;
>> +
>> +	xfs_ilock(ip, XFS_ILOCK_EXCL);
>> +	xfs_trans_ijoin(tp, ip, 0);
>> +
>> +	while (end_fsb > offset_fsb && !error)
>> +		error = xfs_reflink_end_cow_extent_locked(tp, ip, &offset_fsb,
>> +							end_fsb);
> 
> Overly long line, and the continuation line only needs to be indented
> two more tabs.

ok

> 
>> +
>> +	if (error) {
>> +		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
>> +		goto out_cancel;
>> +	}
>> +	error = xfs_trans_commit(tp);
>> +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
>> +	return 0;
> 
> Why is it ok to drop @error here?  Shouldn't a transaction commit error
> should be reported to the writer thread?
>

I can fix that, as I should not ignore errors from xfs_trans_commit()

Thanks,
John
Darrick J. Wong Feb. 25, 2025, 5:50 p.m. UTC | #3
On Tue, Feb 25, 2025 at 11:11:45AM +0000, John Garry wrote:
> On 24/02/2025 20:20, Darrick J. Wong wrote:
> > On Thu, Feb 13, 2025 at 01:56:17PM +0000, John Garry wrote:
> > > When completing a CoW-based write, each extent range mapping update is
> > > covered by a separate transaction.
> > > 
> > > For a CoW-based atomic write, all mappings must be changed at once, so
> > > change to use a single transaction.
> > > 
> > > Signed-off-by: John Garry <john.g.garry@oracle.com>
> > > ---
> > >   fs/xfs/xfs_file.c    |  5 ++++-
> > >   fs/xfs/xfs_reflink.c | 45 ++++++++++++++++++++++++++++++++++++++++++++
> > >   fs/xfs/xfs_reflink.h |  3 +++
> > >   3 files changed, 52 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > > index 9762fa503a41..243640fe4874 100644
> > > --- a/fs/xfs/xfs_file.c
> > > +++ b/fs/xfs/xfs_file.c
> > > @@ -527,7 +527,10 @@ xfs_dio_write_end_io(
> > >   	nofs_flag = memalloc_nofs_save();
> > >   	if (flags & IOMAP_DIO_COW) {
> > > -		error = xfs_reflink_end_cow(ip, offset, size);
> > > +		if (iocb->ki_flags & IOCB_ATOMIC)
> > > +			error = xfs_reflink_end_atomic_cow(ip, offset, size);
> > > +		else
> > > +			error = xfs_reflink_end_cow(ip, offset, size);
> > >   		if (error)
> > >   			goto out;
> > >   	}
> > > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> > > index 3dab3ba900a3..d097d33dc000 100644
> > > --- a/fs/xfs/xfs_reflink.c
> > > +++ b/fs/xfs/xfs_reflink.c
> > > @@ -986,6 +986,51 @@ xfs_reflink_end_cow(
> > >   		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
> > >   	return error;
> > >   }
> > > +int
> > > +xfs_reflink_end_atomic_cow(
> > > +	struct xfs_inode		*ip,
> > > +	xfs_off_t			offset,
> > > +	xfs_off_t			count)
> > > +{
> > > +	xfs_fileoff_t			offset_fsb;
> > > +	xfs_fileoff_t			end_fsb;
> > > +	int				error = 0;
> > > +	struct xfs_mount		*mp = ip->i_mount;
> > > +	struct xfs_trans		*tp;
> > > +	unsigned int			resblks;
> > > +
> > > +	trace_xfs_reflink_end_cow(ip, offset, count);
> > > +
> > > +	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
> > > +	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
> > 
> > Use @mp here instead of walking the pointer.
> 
> Yes
> 
> > 
> > > +
> > > +	resblks = (end_fsb - offset_fsb) *
> > > +			XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK);
> > 
> > How did you arrive at this computation?
> 
> hmmm... you suggested this, but maybe I picked it up incorrectly :)
> 
> > The "b" parameter to
> > XFS_NEXTENTADD_SPACE_RES is usually the worst case number of mappings
> > that you're going to change on this file.  I think that quantity is
> > (end_fsb - offset_fsb)?
> 
> Can you please check this versus what you suggested in
> https://lore.kernel.org/linux-xfs/20250206215014.GX21808@frogsfrogsfrogs/#t

Ah, yeah, that ^^ is correct.  This needs a better comment then:

	/*
	 * Each remapping operation could cause a btree split, so in
	 * the worst case that's one for each block.
	 */
	resblks = (end_fsb - offset_fsb) *
			XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK);

--D

> > 
> > > +
> > > +	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
> > > +			XFS_TRANS_RESERVE, &tp);
> > > +	if (error)
> > > +		return error;
> > > +
> > > +	xfs_ilock(ip, XFS_ILOCK_EXCL);
> > > +	xfs_trans_ijoin(tp, ip, 0);
> > > +
> > > +	while (end_fsb > offset_fsb && !error)
> > > +		error = xfs_reflink_end_cow_extent_locked(tp, ip, &offset_fsb,
> > > +							end_fsb);
> > 
> > Overly long line, and the continuation line only needs to be indented
> > two more tabs.
> 
> ok
> 
> > 
> > > +
> > > +	if (error) {
> > > +		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
> > > +		goto out_cancel;
> > > +	}
> > > +	error = xfs_trans_commit(tp);
> > > +	xfs_iunlock(ip, XFS_ILOCK_EXCL);
> > > +	return 0;
> > 
> > Why is it ok to drop @error here?  Shouldn't a transaction commit error
> > should be reported to the writer thread?
> > 
> 
> I can fix that, as I should not ignore errors from xfs_trans_commit()
> 
> Thanks,
> John
>
John Garry Feb. 25, 2025, 6:07 p.m. UTC | #4
On 25/02/2025 17:50, Darrick J. Wong wrote:
>> Can you please check this versus what you suggested in
>> https://lore.kernel.org/linux- 
>> xfs/20250206215014.GX21808@frogsfrogsfrogs/#t
> Ah, yeah, that ^^ is correct.  This needs a better comment then:
> 
> 	/*
> 	 * Each remapping operation could cause a btree split, so in
> 	 * the worst case that's one for each block.
> 	 */
> 	resblks = (end_fsb - offset_fsb) *
> 			XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK);

ok, fine.

Cheers
diff mbox series

Patch

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 9762fa503a41..243640fe4874 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -527,7 +527,10 @@  xfs_dio_write_end_io(
 	nofs_flag = memalloc_nofs_save();
 
 	if (flags & IOMAP_DIO_COW) {
-		error = xfs_reflink_end_cow(ip, offset, size);
+		if (iocb->ki_flags & IOCB_ATOMIC)
+			error = xfs_reflink_end_atomic_cow(ip, offset, size);
+		else
+			error = xfs_reflink_end_cow(ip, offset, size);
 		if (error)
 			goto out;
 	}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 3dab3ba900a3..d097d33dc000 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -986,6 +986,51 @@  xfs_reflink_end_cow(
 		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
 	return error;
 }
+int
+xfs_reflink_end_atomic_cow(
+	struct xfs_inode		*ip,
+	xfs_off_t			offset,
+	xfs_off_t			count)
+{
+	xfs_fileoff_t			offset_fsb;
+	xfs_fileoff_t			end_fsb;
+	int				error = 0;
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_trans		*tp;
+	unsigned int			resblks;
+
+	trace_xfs_reflink_end_cow(ip, offset, count);
+
+	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
+
+	resblks = (end_fsb - offset_fsb) *
+			XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK);
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+			XFS_TRANS_RESERVE, &tp);
+	if (error)
+		return error;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
+
+	while (end_fsb > offset_fsb && !error)
+		error = xfs_reflink_end_cow_extent_locked(tp, ip, &offset_fsb,
+							end_fsb);
+
+	if (error) {
+		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
+		goto out_cancel;
+	}
+	error = xfs_trans_commit(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return 0;
+out_cancel:
+	xfs_trans_cancel(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+}
 
 /*
  * Free all CoW staging blocks that are still referenced by the ondisk refcount
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 754d2bb692d3..ca945d98e823 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -43,6 +43,9 @@  extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
 		xfs_off_t count, bool cancel_real);
 extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
 		xfs_off_t count);
+		int
+xfs_reflink_end_atomic_cow(struct xfs_inode *ip, xfs_off_t offset,
+		xfs_off_t count);
 extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
 extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
 		struct file *file_out, loff_t pos_out, loff_t len,