diff mbox series

[v10,5/8] fs: iomap: Atomic write support

Message ID 20241019125113.369994-6-john.g.garry@oracle.com (mailing list archive)
State Under Review
Headers show
Series block atomic writes for xfs | expand

Commit Message

John Garry Oct. 19, 2024, 12:51 p.m. UTC
Support direct I/O atomic writes by producing a single bio with REQ_ATOMIC
flag set.

Initially FSes (XFS) should only support writing a single FS block
atomically.

As with any atomic write, we should produce a single bio which covers the
complete write length.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 .../filesystems/iomap/operations.rst          | 12 ++++++
 fs/iomap/direct-io.c                          | 38 +++++++++++++++++--
 fs/iomap/trace.h                              |  3 +-
 include/linux/iomap.h                         |  1 +
 4 files changed, 49 insertions(+), 5 deletions(-)

Comments

Ritesh Harjani (IBM) Oct. 20, 2024, 8:21 a.m. UTC | #1
John Garry <john.g.garry@oracle.com> writes:

> Support direct I/O atomic writes by producing a single bio with REQ_ATOMIC
> flag set.
>
> Initially FSes (XFS) should only support writing a single FS block
> atomically.
>
> As with any atomic write, we should produce a single bio which covers the
> complete write length.
>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  .../filesystems/iomap/operations.rst          | 12 ++++++
>  fs/iomap/direct-io.c                          | 38 +++++++++++++++++--
>  fs/iomap/trace.h                              |  3 +-
>  include/linux/iomap.h                         |  1 +
>  4 files changed, 49 insertions(+), 5 deletions(-)
>
> diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
> index b93115ab8748..529f81dd3d2c 100644
> --- a/Documentation/filesystems/iomap/operations.rst
> +++ b/Documentation/filesystems/iomap/operations.rst
> @@ -513,6 +513,18 @@ IOMAP_WRITE`` with any combination of the following enhancements:
>     if the mapping is unwritten and the filesystem cannot handle zeroing
>     the unaligned regions without exposing stale contents.
>  
> + * ``IOMAP_ATOMIC``: This write is being issued with torn-write
> +   protection.
> +   Only a single bio can be created for the write, and the write must
> +   not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be
> +   set.
> +   The file range to write must be aligned to satisfy the requirements
> +   of both the filesystem and the underlying block device's atomic
> +   commit capabilities.
> +   If filesystem metadata updates are required (e.g. unwritten extent
> +   conversion or copy on write), all updates for the entire file range
> +   must be committed atomically as well.
> +
>  Callers commonly hold ``i_rwsem`` in shared or exclusive mode before
>  calling this function.
>  
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index f637aa0706a3..ed4764e3b8f0 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -271,7 +271,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
>   * clearing the WRITE_THROUGH flag in the dio request.
>   */
>  static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
> -		const struct iomap *iomap, bool use_fua)
> +		const struct iomap *iomap, bool use_fua, bool atomic)
>  {
>  	blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
>  
> @@ -283,6 +283,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
>  		opflags |= REQ_FUA;
>  	else
>  		dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
> +	if (atomic)
> +		opflags |= REQ_ATOMIC;
>  
>  	return opflags;
>  }
> @@ -293,7 +295,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  	const struct iomap *iomap = &iter->iomap;
>  	struct inode *inode = iter->inode;
>  	unsigned int fs_block_size = i_blocksize(inode), pad;
> -	loff_t length = iomap_length(iter);
> +	const loff_t length = iomap_length(iter);
> +	bool atomic = iter->flags & IOMAP_ATOMIC;
>  	loff_t pos = iter->pos;
>  	blk_opf_t bio_opf;
>  	struct bio *bio;
> @@ -303,6 +306,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  	size_t copied = 0;
>  	size_t orig_count;
>  
> +	if (atomic && length != fs_block_size)
> +		return -EINVAL;

We anyway mandate iov_iter_count() write should be same as sb_blocksize
in xfs_file_write_iter() for atomic writes.
This comparison here is not required. I believe we do plan to lift this
restriction maybe when we are going to add forcealign support right? 

And similarly this needs to be lifted when ext4 adds support for atomic
write even with bigalloc. I hope we can do so when we add such support, right?

(I guess, that is also the reason we haven't mentioned this restriction
in description of "IOMAP_ATOMIC" in Documentation.)

-ritesh
John Garry Oct. 20, 2024, 11:21 a.m. UTC | #2
On 20/10/2024 09:21, Ritesh Harjani (IBM) wrote:
>>   -293,7 +295,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   	const struct iomap *iomap = &iter->iomap;
>>   	struct inode *inode = iter->inode;
>>   	unsigned int fs_block_size = i_blocksize(inode), pad;
>> -	loff_t length = iomap_length(iter);
>> +	const loff_t length = iomap_length(iter);
>> +	bool atomic = iter->flags & IOMAP_ATOMIC;
>>   	loff_t pos = iter->pos;
>>   	blk_opf_t bio_opf;
>>   	struct bio *bio;
>> @@ -303,6 +306,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   	size_t copied = 0;
>>   	size_t orig_count;
>>   
>> +	if (atomic && length != fs_block_size)
>> +		return -EINVAL;
> We anyway mandate iov_iter_count() write should be same as sb_blocksize
> in xfs_file_write_iter() for atomic writes.
> This comparison here is not required. I believe we do plan to lift this
> restriction maybe when we are going to add forcealign support right?

Yes, we would lift this restriction if and when forcealign is added. Or 
when bigalloc is leveraged for ext4 atomic writes.

But I think that today it is proper to add this check, as we are saying 
that iomap DIO path does not support anything else than fs_block_size.

For forcealign, we were introducing support for atomic writes spanning 
mixed unwritten and written extents in [0]. We don't have that support 
here, so it is prudent to say that we just support fs_block_size.

[0] 
https://lore.kernel.org/linux-xfs/20240607143919.2622319-4-john.g.garry@oracle.com/

> 
> And similarly this needs to be lifted when ext4 adds support for atomic
> write even with bigalloc. I hope we can do so when we add such support, right?

Right

> 
> (I guess, that is also the reason we haven't mentioned this restriction
> in description of "IOMAP_ATOMIC" in Documentation.)

The documentation does not mention the size, but that is not intentional.

Thanks,
John
Ritesh Harjani (IBM) Oct. 20, 2024, 11:37 a.m. UTC | #3
John Garry <john.g.garry@oracle.com> writes:

> On 20/10/2024 09:21, Ritesh Harjani (IBM) wrote:
>>>   -293,7 +295,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>>   	const struct iomap *iomap = &iter->iomap;
>>>   	struct inode *inode = iter->inode;
>>>   	unsigned int fs_block_size = i_blocksize(inode), pad;
>>> -	loff_t length = iomap_length(iter);
>>> +	const loff_t length = iomap_length(iter);
>>> +	bool atomic = iter->flags & IOMAP_ATOMIC;
>>>   	loff_t pos = iter->pos;
>>>   	blk_opf_t bio_opf;
>>>   	struct bio *bio;
>>> @@ -303,6 +306,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>>   	size_t copied = 0;
>>>   	size_t orig_count;
>>>   
>>> +	if (atomic && length != fs_block_size)
>>> +		return -EINVAL;
>> We anyway mandate iov_iter_count() write should be same as sb_blocksize
>> in xfs_file_write_iter() for atomic writes.
>> This comparison here is not required. I believe we do plan to lift this
>> restriction maybe when we are going to add forcealign support right?
>
> Yes, we would lift this restriction if and when forcealign is added. Or 
> when bigalloc is leveraged for ext4 atomic writes.
>
> But I think that today it is proper to add this check, as we are saying 
> that iomap DIO path does not support anything else than fs_block_size.
>
> For forcealign, we were introducing support for atomic writes spanning 
> mixed unwritten and written extents in [0]. We don't have that support 
> here, so it is prudent to say that we just support fs_block_size.
>
> [0] 
> https://lore.kernel.org/linux-xfs/20240607143919.2622319-4-john.g.garry@oracle.com/
>

Sure.

>> 
>> And similarly this needs to be lifted when ext4 adds support for atomic
>> write even with bigalloc. I hope we can do so when we add such support, right?
>
> Right
>

Thanks for confirming that.
The patch looks good to me. Please feel free to add - 

Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
diff mbox series

Patch

diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
index b93115ab8748..529f81dd3d2c 100644
--- a/Documentation/filesystems/iomap/operations.rst
+++ b/Documentation/filesystems/iomap/operations.rst
@@ -513,6 +513,18 @@  IOMAP_WRITE`` with any combination of the following enhancements:
    if the mapping is unwritten and the filesystem cannot handle zeroing
    the unaligned regions without exposing stale contents.
 
+ * ``IOMAP_ATOMIC``: This write is being issued with torn-write
+   protection.
+   Only a single bio can be created for the write, and the write must
+   not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be
+   set.
+   The file range to write must be aligned to satisfy the requirements
+   of both the filesystem and the underlying block device's atomic
+   commit capabilities.
+   If filesystem metadata updates are required (e.g. unwritten extent
+   conversion or copy on write), all updates for the entire file range
+   must be committed atomically as well.
+
 Callers commonly hold ``i_rwsem`` in shared or exclusive mode before
 calling this function.
 
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index f637aa0706a3..ed4764e3b8f0 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -271,7 +271,7 @@  static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
  * clearing the WRITE_THROUGH flag in the dio request.
  */
 static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
-		const struct iomap *iomap, bool use_fua)
+		const struct iomap *iomap, bool use_fua, bool atomic)
 {
 	blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
 
@@ -283,6 +283,8 @@  static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
 		opflags |= REQ_FUA;
 	else
 		dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
+	if (atomic)
+		opflags |= REQ_ATOMIC;
 
 	return opflags;
 }
@@ -293,7 +295,8 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 	const struct iomap *iomap = &iter->iomap;
 	struct inode *inode = iter->inode;
 	unsigned int fs_block_size = i_blocksize(inode), pad;
-	loff_t length = iomap_length(iter);
+	const loff_t length = iomap_length(iter);
+	bool atomic = iter->flags & IOMAP_ATOMIC;
 	loff_t pos = iter->pos;
 	blk_opf_t bio_opf;
 	struct bio *bio;
@@ -303,6 +306,9 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 	size_t copied = 0;
 	size_t orig_count;
 
+	if (atomic && length != fs_block_size)
+		return -EINVAL;
+
 	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
 	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
 		return -EINVAL;
@@ -382,7 +388,7 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 	 * can set up the page vector appropriately for a ZONE_APPEND
 	 * operation.
 	 */
-	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
+	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
 
 	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
 	do {
@@ -415,6 +421,17 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 		}
 
 		n = bio->bi_iter.bi_size;
+		if (WARN_ON_ONCE(atomic && n != length)) {
+			/*
+			 * This bio should have covered the complete length,
+			 * which it doesn't, so error. We may need to zero out
+			 * the tail (complete FS block), similar to when
+			 * bio_iov_iter_get_pages() returns an error, above.
+			 */
+			ret = -EINVAL;
+			bio_put(bio);
+			goto zero_tail;
+		}
 		if (dio->flags & IOMAP_DIO_WRITE) {
 			task_io_account_write(n);
 		} else {
@@ -598,6 +615,9 @@  __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (iocb->ki_flags & IOCB_NOWAIT)
 		iomi.flags |= IOMAP_NOWAIT;
 
+	if (iocb->ki_flags & IOCB_ATOMIC)
+		iomi.flags |= IOMAP_ATOMIC;
+
 	if (iov_iter_rw(iter) == READ) {
 		/* reads can always complete inline */
 		dio->flags |= IOMAP_DIO_INLINE_COMP;
@@ -659,7 +679,17 @@  __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 			if (ret != -EAGAIN) {
 				trace_iomap_dio_invalidate_fail(inode, iomi.pos,
 								iomi.len);
-				ret = -ENOTBLK;
+				if (iocb->ki_flags & IOCB_ATOMIC) {
+					/*
+					 * folio invalidation failed, maybe
+					 * this is transient, unlock and see if
+					 * the caller tries again.
+					 */
+					ret = -EAGAIN;
+				} else {
+					/* fall back to buffered write */
+					ret = -ENOTBLK;
+				}
 			}
 			goto out_free_dio;
 		}
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 0a991c4ce87d..4118a42cdab0 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -98,7 +98,8 @@  DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
 	{ IOMAP_REPORT,		"REPORT" }, \
 	{ IOMAP_FAULT,		"FAULT" }, \
 	{ IOMAP_DIRECT,		"DIRECT" }, \
-	{ IOMAP_NOWAIT,		"NOWAIT" }
+	{ IOMAP_NOWAIT,		"NOWAIT" }, \
+	{ IOMAP_ATOMIC,		"ATOMIC" }
 
 #define IOMAP_F_FLAGS_STRINGS \
 	{ IOMAP_F_NEW,		"NEW" }, \
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index d0420e962ffd..84282db3e4c1 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -178,6 +178,7 @@  struct iomap_folio_ops {
 #else
 #define IOMAP_DAX		0
 #endif /* CONFIG_FS_DAX */
+#define IOMAP_ATOMIC		(1 << 9)
 
 struct iomap_ops {
 	/*