
[v4,09/12] xfs: Add xfs_file_dio_write_atomic()

Message ID: 20250303171120.2837067-10-john.g.garry@oracle.com
State: New
Series: large atomic writes for xfs with CoW

Commit Message

John Garry March 3, 2025, 5:11 p.m. UTC
Add xfs_file_dio_write_atomic() for dedicated handling of atomic writes.

In case of -EAGAIN being returned from iomap_dio_rw(), reissue the write
in CoW-based atomic write mode.

For CoW-based mode, ensure that we have no outstanding IOs which we
may trample on.

Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/xfs/xfs_file.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
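
For reference, this is the kind of request the new path serves: an O_DIRECT
write issued with pwritev2() and RWF_ATOMIC, which the VFS turns into an
IOCB_ATOMIC kiocb. Below is a minimal userspace sketch only - the file path,
write size and the fallback flag value are illustrative assumptions, and a
real caller should first query the filesystem's atomic write limits via
statx():

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC	0x00000040	/* assumed uapi value; check linux/fs.h */
#endif

int main(void)
{
	const size_t len = 16384;	/* must sit within the fs atomic write unit min/max */
	struct iovec iov;
	void *buf;
	ssize_t ret;
	int fd;

	fd = open("/mnt/xfs/testfile", O_RDWR | O_DIRECT);	/* illustrative path */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (posix_memalign(&buf, 4096, len)) {
		fprintf(stderr, "posix_memalign failed\n");
		return 1;
	}
	memset(buf, 0xab, len);

	iov.iov_base = buf;
	iov.iov_len = len;

	/* all-or-nothing: either the whole 16k reaches disk or none of it */
	ret = pwritev2(fd, &iov, 1, 0, RWF_ATOMIC);
	if (ret < 0)
		perror("pwritev2(RWF_ATOMIC)");

	free(buf);
	close(fd);
	return ret == (ssize_t)len ? 0 : 1;
}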

Comments

Ritesh Harjani (IBM) March 10, 2025, 1:39 p.m. UTC | #1
John Garry <john.g.garry@oracle.com> writes:

> Add xfs_file_dio_write_atomic() for dedicated handling of atomic writes.
>
> In case of -EAGAIN being returned from iomap_dio_rw(), reissue the write
> in CoW-based atomic write mode.
>
> For CoW-based mode, ensure that we have no outstanding IOs which we
> may trample on.
>
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/xfs/xfs_file.c | 42 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 42 insertions(+)
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 51b4a43d15f3..70eb6928cf63 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -619,6 +619,46 @@ xfs_file_dio_write_aligned(
>  	return ret;
>  }
>  
> +static noinline ssize_t
> +xfs_file_dio_write_atomic(
> +	struct xfs_inode	*ip,
> +	struct kiocb		*iocb,
> +	struct iov_iter		*from)
> +{
> +	unsigned int		iolock = XFS_IOLOCK_SHARED;
> +	unsigned int		dio_flags = 0;
> +	ssize_t			ret;
> +
> +retry:
> +	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
> +	if (ret)
> +		return ret;
> +
> +	ret = xfs_file_write_checks(iocb, from, &iolock);
> +	if (ret)
> +		goto out_unlock;
> +
> +	if (dio_flags & IOMAP_DIO_FORCE_WAIT)
> +		inode_dio_wait(VFS_I(ip));
> +
> +	trace_xfs_file_direct_write(iocb, from);
> +	ret = iomap_dio_rw(iocb, from, &xfs_atomic_write_iomap_ops,
> +			&xfs_dio_write_ops, dio_flags, NULL, 0);
> +
> +	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT) &&
> +	    !(dio_flags & IOMAP_DIO_ATOMIC_SW)) {
> +		xfs_iunlock(ip, iolock);
> +		dio_flags = IOMAP_DIO_ATOMIC_SW | IOMAP_DIO_FORCE_WAIT;
> +		iolock = XFS_IOLOCK_EXCL;
> +		goto retry;
> +	}

IIUC, filesystems can now typically implement IOMAP_ATOMIC_SW support
as a fallback mechanism by returning -EAGAIN during IOMAP_ATOMIC_HW
handling from their ->iomap_begin() routine.  They can then retry the
entire iomap_dio_rw() DIO operation by passing the IOMAP_DIO_ATOMIC_SW
flag in the dio_flags argument and handle the IOMAP_ATOMIC_SW fallback
differently in their ->iomap_begin() routine.
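
In sketch form, that contract could look roughly like the below in a
filesystem's ->iomap_begin() (the fs_* helpers are hypothetical
placeholders, not the implementation from this series):

/* sketch only: fs_* helpers below are hypothetical */
static int
fs_atomic_write_iomap_begin(
	struct inode		*inode,
	loff_t			offset,
	loff_t			length,
	unsigned int		flags,
	struct iomap		*iomap,
	struct iomap		*srcmap)
{
	if (flags & IOMAP_ATOMIC_SW)
		/* retried with IOMAP_DIO_ATOMIC_SW: hand back a CoW mapping */
		return fs_map_cow_atomic_range(inode, offset, length, iomap);

	/* IOMAP_ATOMIC_HW: ask the caller to fall back if HW can't do it */
	if (!fs_can_issue_hw_atomic(inode, offset, length))
		return -EAGAIN;

	return fs_map_hw_atomic_range(inode, offset, length, iomap);
}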

However, -EAGAIN can also be returned when there is a race with mmap
writes for the same range: we return the same -EAGAIN error on page
cache invalidation failure for IOCB_ATOMIC writes too.  The current
code does not differentiate between these two types of failure, so we
always retry by falling back to the SW CoW-based atomic write even for
page cache invalidation failures.

__iomap_dio_rw()
{
<...>
		/*
		 * Try to invalidate cache pages for the range we are writing.
		 * If this invalidation fails, let the caller fall back to
		 * buffered I/O.
		 */
		ret = kiocb_invalidate_pages(iocb, iomi.len);
		if (ret) {
			if (ret != -EAGAIN) {
				trace_iomap_dio_invalidate_fail(inode, iomi.pos,
								iomi.len);
				if (iocb->ki_flags & IOCB_ATOMIC) {
					/*
					 * folio invalidation failed, maybe
					 * this is transient, unlock and see if
					 * the caller tries again.
					 */
					ret = -EAGAIN;
				} else {
					/* fall back to buffered write */
					ret = -ENOTBLK;
				}
			}
			goto out_free_dio;
		}
<...>
}

It's easy to miss such error handling conditions. If this is something
that was already discussed earlier, then perhaps it is better if we
document it.  BTW - is this something that we already know of and have
kept as such intentionally?


-ritesh


> +
> +out_unlock:
> +	if (iolock)
> +		xfs_iunlock(ip, iolock);
> +	return ret;
> +}
> +
>  /*
>   * Handle block unaligned direct I/O writes
>   *
> @@ -723,6 +763,8 @@ xfs_file_dio_write(
>  		return -EINVAL;
>  	if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
>  		return xfs_file_dio_write_unaligned(ip, iocb, from);
> +	if (iocb->ki_flags & IOCB_ATOMIC)
> +		return xfs_file_dio_write_atomic(ip, iocb, from);
>  	return xfs_file_dio_write_aligned(ip, iocb, from);
>  }
>  
> -- 
> 2.31.1
John Garry March 10, 2025, 3:24 p.m. UTC | #2
On 10/03/2025 13:39, Ritesh Harjani (IBM) wrote:
> John Garry <john.g.garry@oracle.com> writes:
> 
>> Add xfs_file_dio_write_atomic() for dedicated handling of atomic writes.
>>
>> In case of -EAGAIN being returned from iomap_dio_rw(), reissue the write
>> in CoW-based atomic write mode.
>>
>> For CoW-based mode, ensure that we have no outstanding IOs which we
>> may trample on.
>>
>> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>>   fs/xfs/xfs_file.c | 42 ++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 42 insertions(+)
>>
>> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
>> index 51b4a43d15f3..70eb6928cf63 100644
>> --- a/fs/xfs/xfs_file.c
>> +++ b/fs/xfs/xfs_file.c
>> @@ -619,6 +619,46 @@ xfs_file_dio_write_aligned(
>>   	return ret;
>>   }
>>   
>> +static noinline ssize_t
>> +xfs_file_dio_write_atomic(
>> +	struct xfs_inode	*ip,
>> +	struct kiocb		*iocb,
>> +	struct iov_iter		*from)
>> +{
>> +	unsigned int		iolock = XFS_IOLOCK_SHARED;
>> +	unsigned int		dio_flags = 0;
>> +	ssize_t			ret;
>> +
>> +retry:
>> +	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
>> +	if (ret)
>> +		return ret;
>> +
>> +	ret = xfs_file_write_checks(iocb, from, &iolock);
>> +	if (ret)
>> +		goto out_unlock;
>> +
>> +	if (dio_flags & IOMAP_DIO_FORCE_WAIT)
>> +		inode_dio_wait(VFS_I(ip));
>> +
>> +	trace_xfs_file_direct_write(iocb, from);
>> +	ret = iomap_dio_rw(iocb, from, &xfs_atomic_write_iomap_ops,
>> +			&xfs_dio_write_ops, dio_flags, NULL, 0);
>> +
>> +	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT) &&
>> +	    !(dio_flags & IOMAP_DIO_ATOMIC_SW)) {
>> +		xfs_iunlock(ip, iolock);
>> +		dio_flags = IOMAP_DIO_ATOMIC_SW | IOMAP_DIO_FORCE_WAIT;
>> +		iolock = XFS_IOLOCK_EXCL;
>> +		goto retry;
>> +	}
> 
> IIUC, filesystems can now typically implement IOMAP_ATOMIC_SW support
> as a fallback mechanism by returning -EAGAIN during IOMAP_ATOMIC_HW
> handling from their ->iomap_begin() routine.  They can then retry the
> entire iomap_dio_rw() DIO operation by passing the IOMAP_DIO_ATOMIC_SW
> flag in the dio_flags argument and handle the IOMAP_ATOMIC_SW fallback
> differently in their ->iomap_begin() routine.
> 
> However, -EAGAIN can also be returned when there is a race with mmap
> writes for the same range: we return the same -EAGAIN error on page
> cache invalidation failure for IOCB_ATOMIC writes too.  The current
> code does not differentiate between these two types of failure, so we
> always retry by falling back to the SW CoW-based atomic write even for
> page cache invalidation failures.
> 
> __iomap_dio_rw()
> {
> <...>
> 		/*
> 		 * Try to invalidate cache pages for the range we are writing.
> 		 * If this invalidation fails, let the caller fall back to
> 		 * buffered I/O.
> 		 */
> 		ret = kiocb_invalidate_pages(iocb, iomi.len);
> 		if (ret) {
> 			if (ret != -EAGAIN) {
> 				trace_iomap_dio_invalidate_fail(inode, iomi.pos,
> 								iomi.len);
> 				if (iocb->ki_flags & IOCB_ATOMIC) {
> 					/*
> 					 * folio invalidation failed, maybe
> 					 * this is transient, unlock and see if
> 					 * the caller tries again.
> 					 */
> 					ret = -EAGAIN;
> 				} else {
> 					/* fall back to buffered write */
> 					ret = -ENOTBLK;
> 				}
> 			}
> 			goto out_free_dio;
> 		}
> <...>
> }
> 
> It's easy to miss such error handling conditions. If this is something
> that was already discussed earlier, then perhaps it is better if we
> document it.  BTW - is this something that we already know of and have
> kept as such intentionally?
> 

On mainline, for kiocb_invalidate_pages() error for IOCB_ATOMIC, we 
always return -EAGAIN to userspace.

Now if we have any kiocb_invalidate_pages() error for IOCB_ATOMIC, we 
retry with SW CoW mode - and if it fails again, we return -EAGAIN to 
userspace.

If we choose some other error code to trigger the SW-based COW retry (so
that we don't always retry on a kiocb_invalidate_pages() error when
!IOMAP_DIO_ATOMIC_HW), then kiocb_invalidate_pages() could still return
that same error code and we would still retry in SW-based COW mode - is
that better? Or do we need to choose some error code which
kiocb_invalidate_pages() would never return?

Note that -EAGAIN is used by xfs_file_dio_write_unaligned(), so it would
be nice to use the same error code.

Thanks,
John
Darrick J. Wong March 10, 2025, 5:25 p.m. UTC | #3
On Mon, Mar 10, 2025 at 03:24:23PM +0000, John Garry wrote:
> On 10/03/2025 13:39, Ritesh Harjani (IBM) wrote:
> > John Garry <john.g.garry@oracle.com> writes:
> > 
> > > Add xfs_file_dio_write_atomic() for dedicated handling of atomic writes.
> > > 
> > > In case of -EAGAIN being returned from iomap_dio_rw(), reissue the write
> > > in CoW-based atomic write mode.
> > > 
> > > For CoW-based mode, ensure that we have no outstanding IOs which we
> > > may trample on.
> > > 
> > > Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
> > > Signed-off-by: John Garry <john.g.garry@oracle.com>
> > > ---
> > >   fs/xfs/xfs_file.c | 42 ++++++++++++++++++++++++++++++++++++++++++
> > >   1 file changed, 42 insertions(+)
> > > 
> > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > > index 51b4a43d15f3..70eb6928cf63 100644
> > > --- a/fs/xfs/xfs_file.c
> > > +++ b/fs/xfs/xfs_file.c
> > > @@ -619,6 +619,46 @@ xfs_file_dio_write_aligned(
> > >   	return ret;
> > >   }
> > > +static noinline ssize_t
> > > +xfs_file_dio_write_atomic(
> > > +	struct xfs_inode	*ip,
> > > +	struct kiocb		*iocb,
> > > +	struct iov_iter		*from)
> > > +{
> > > +	unsigned int		iolock = XFS_IOLOCK_SHARED;
> > > +	unsigned int		dio_flags = 0;
> > > +	ssize_t			ret;
> > > +
> > > +retry:
> > > +	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	ret = xfs_file_write_checks(iocb, from, &iolock);
> > > +	if (ret)
> > > +		goto out_unlock;
> > > +
> > > +	if (dio_flags & IOMAP_DIO_FORCE_WAIT)
> > > +		inode_dio_wait(VFS_I(ip));
> > > +
> > > +	trace_xfs_file_direct_write(iocb, from);
> > > +	ret = iomap_dio_rw(iocb, from, &xfs_atomic_write_iomap_ops,
> > > +			&xfs_dio_write_ops, dio_flags, NULL, 0);
> > > +
> > > +	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT) &&
> > > +	    !(dio_flags & IOMAP_DIO_ATOMIC_SW)) {
> > > +		xfs_iunlock(ip, iolock);
> > > +		dio_flags = IOMAP_DIO_ATOMIC_SW | IOMAP_DIO_FORCE_WAIT;
> > > +		iolock = XFS_IOLOCK_EXCL;
> > > +		goto retry;
> > > +	}
> > 
> > IIUC, filesystems can now typically implement IOMAP_ATOMIC_SW support
> > as a fallback mechanism by returning -EAGAIN during IOMAP_ATOMIC_HW
> > handling from their ->iomap_begin() routine.  They can then retry the
> > entire iomap_dio_rw() DIO operation by passing the IOMAP_DIO_ATOMIC_SW
> > flag in the dio_flags argument and handle the IOMAP_ATOMIC_SW fallback
> > differently in their ->iomap_begin() routine.
> > 
> > However, -EAGAIN can also be returned when there is a race with mmap
> > writes for the same range: we return the same -EAGAIN error on page
> > cache invalidation failure for IOCB_ATOMIC writes too.  The current
> > code does not differentiate between these two types of failure, so we
> > always retry by falling back to the SW CoW-based atomic write even for
> > page cache invalidation failures.
> > 
> > __iomap_dio_rw()
> > {
> > <...>
> > 		/*
> > 		 * Try to invalidate cache pages for the range we are writing.
> > 		 * If this invalidation fails, let the caller fall back to
> > 		 * buffered I/O.
> > 		 */
> > 		ret = kiocb_invalidate_pages(iocb, iomi.len);
> > 		if (ret) {
> > 			if (ret != -EAGAIN) {
> > 				trace_iomap_dio_invalidate_fail(inode, iomi.pos,
> > 								iomi.len);
> > 				if (iocb->ki_flags & IOCB_ATOMIC) {
> > 					/*
> > 					 * folio invalidation failed, maybe
> > 					 * this is transient, unlock and see if
> > 					 * the caller tries again.
> > 					 */
> > 					ret = -EAGAIN;
> > 				} else {
> > 					/* fall back to buffered write */
> > 					ret = -ENOTBLK;
> > 				}
> > 			}
> > 			goto out_free_dio;
> > 		}
> > <...>
> > }
> > 
> > It's easy to miss such error handling conditions. If this is something
> > that was already discussed earlier, then perhaps it is better if we
> > document it.  BTW - is this something that we already know of and have
> > kept as such intentionally?
> > 
> 
> On mainline, for kiocb_invalidate_pages() error for IOCB_ATOMIC, we always
> return -EAGAIN to userspace.
> 
> Now if we have any kiocb_invalidate_pages() error for IOCB_ATOMIC, we retry
> with SW CoW mode - and if it fails again, we return -EAGAIN to userspace.
> 
> If we choose some other error code to trigger the SW-based COW retry (so
> that we don't always retry on a kiocb_invalidate_pages() error when
> !IOMAP_DIO_ATOMIC_HW), then kiocb_invalidate_pages() could still return
> that same error code and we would still retry in SW-based COW mode - is
> that better? Or do we need to choose some error code which
> kiocb_invalidate_pages() would never return?
> 
> Note that -EAGAIN is used by xfs_file_dio_write_unaligned(), so it would be
> nice to use the same error code.

Frankly I don't see why it's a problem that EAGAIN triggers the software
fallback no matter what tripped that.  Maybe the writer would be ok with
the retry even if it came from an (unlikely) mmap write collision.

--D

> Thanks,
> John
>

Patch

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 51b4a43d15f3..70eb6928cf63 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -619,6 +619,46 @@ xfs_file_dio_write_aligned(
 	return ret;
 }
 
+static noinline ssize_t
+xfs_file_dio_write_atomic(
+	struct xfs_inode	*ip,
+	struct kiocb		*iocb,
+	struct iov_iter		*from)
+{
+	unsigned int		iolock = XFS_IOLOCK_SHARED;
+	unsigned int		dio_flags = 0;
+	ssize_t			ret;
+
+retry:
+	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
+	if (ret)
+		return ret;
+
+	ret = xfs_file_write_checks(iocb, from, &iolock);
+	if (ret)
+		goto out_unlock;
+
+	if (dio_flags & IOMAP_DIO_FORCE_WAIT)
+		inode_dio_wait(VFS_I(ip));
+
+	trace_xfs_file_direct_write(iocb, from);
+	ret = iomap_dio_rw(iocb, from, &xfs_atomic_write_iomap_ops,
+			&xfs_dio_write_ops, dio_flags, NULL, 0);
+
+	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT) &&
+	    !(dio_flags & IOMAP_DIO_ATOMIC_SW)) {
+		xfs_iunlock(ip, iolock);
+		dio_flags = IOMAP_DIO_ATOMIC_SW | IOMAP_DIO_FORCE_WAIT;
+		iolock = XFS_IOLOCK_EXCL;
+		goto retry;
+	}
+
+out_unlock:
+	if (iolock)
+		xfs_iunlock(ip, iolock);
+	return ret;
+}
+
 /*
  * Handle block unaligned direct I/O writes
  *
@@ -723,6 +763,8 @@ xfs_file_dio_write(
 		return -EINVAL;
 	if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
 		return xfs_file_dio_write_unaligned(ip, iocb, from);
+	if (iocb->ki_flags & IOCB_ATOMIC)
+		return xfs_file_dio_write_atomic(ip, iocb, from);
 	return xfs_file_dio_write_aligned(ip, iocb, from);
 }