diff mbox series

[v6,3/7] fs: iomap: Atomic write support

Message ID 20240930125438.2501050-4-john.g.garry@oracle.com (mailing list archive)
State Not Applicable, archived
Headers show
Series block atomic writes for xfs | expand

Commit Message

John Garry Sept. 30, 2024, 12:54 p.m. UTC
Support direct I/O atomic writes by producing a single bio with REQ_ATOMIC
flag set.

Initially FSes (XFS) should only support writing a single FS block
atomically.

As with any atomic write, we should produce a single bio which covers the
complete write length.

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
Maybe we should also enforce that a mapping is in written state, as it
avoids issues later with forcealign and writing mappings which cover
multiple extents in different written/unwritten state.

 fs/iomap/direct-io.c  | 26 +++++++++++++++++++++++---
 fs/iomap/trace.h      |  3 ++-
 include/linux/iomap.h |  1 +
 3 files changed, 26 insertions(+), 4 deletions(-)

Comments

Darrick J. Wong Sept. 30, 2024, 3:55 p.m. UTC | #1
On Mon, Sep 30, 2024 at 12:54:34PM +0000, John Garry wrote:
> Support direct I/O atomic writes by producing a single bio with REQ_ATOMIC
> flag set.
> 
> Initially FSes (XFS) should only support writing a single FS block
> atomically.
> 
> As with any atomic write, we should produce a single bio which covers the
> complete write length.
> 
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
> Maybe we should also enforce that a mapping is in written state, as it
> avoids issues later with forcealign and writing mappings which cover
> multiple extents in different written/unwritten state.
> 
>  fs/iomap/direct-io.c  | 26 +++++++++++++++++++++++---
>  fs/iomap/trace.h      |  3 ++-
>  include/linux/iomap.h |  1 +
>  3 files changed, 26 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index f637aa0706a3..9401c05cd2c0 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -271,7 +271,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
>   * clearing the WRITE_THROUGH flag in the dio request.
>   */
>  static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
> -		const struct iomap *iomap, bool use_fua)
> +		const struct iomap *iomap, bool use_fua, bool atomic)
>  {
>  	blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
>  
> @@ -283,6 +283,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
>  		opflags |= REQ_FUA;
>  	else
>  		dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
> +	if (atomic)
> +		opflags |= REQ_ATOMIC;
>  
>  	return opflags;
>  }
> @@ -293,7 +295,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  	const struct iomap *iomap = &iter->iomap;
>  	struct inode *inode = iter->inode;
>  	unsigned int fs_block_size = i_blocksize(inode), pad;
> -	loff_t length = iomap_length(iter);
> +	const loff_t length = iomap_length(iter);
> +	bool atomic = iter->flags & IOMAP_ATOMIC;
>  	loff_t pos = iter->pos;
>  	blk_opf_t bio_opf;
>  	struct bio *bio;
> @@ -303,6 +306,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  	size_t copied = 0;
>  	size_t orig_count;
>  
> +	if (atomic && (length != fs_block_size))
> +		return -EINVAL;
> +
>  	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
>  	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
>  		return -EINVAL;
> @@ -382,7 +388,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  	 * can set up the page vector appropriately for a ZONE_APPEND
>  	 * operation.
>  	 */
> -	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
> +	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
>  
>  	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
>  	do {
> @@ -415,6 +421,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  		}
>  
>  		n = bio->bi_iter.bi_size;
> +		if (atomic && n != length) {
> +			/*
> +			 * This bio should have covered the complete length,
> +			 * which it doesn't, so error. We may need to zero out
> +			 * the tail (complete FS block), similar to when
> +			 * bio_iov_iter_get_pages() returns an error, above.
> +			 */
> +			ret = -EINVAL;
> +			bio_put(bio);
> +			goto zero_tail;
> +		}
>  		if (dio->flags & IOMAP_DIO_WRITE) {
>  			task_io_account_write(n);
>  		} else {
> @@ -598,6 +615,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  	if (iocb->ki_flags & IOCB_NOWAIT)
>  		iomi.flags |= IOMAP_NOWAIT;
>  
> +	if (iocb->ki_flags & IOCB_ATOMIC)
> +		iomi.flags |= IOMAP_ATOMIC;
> +
>  	if (iov_iter_rw(iter) == READ) {
>  		/* reads can always complete inline */
>  		dio->flags |= IOMAP_DIO_INLINE_COMP;
> diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
> index 0a991c4ce87d..4118a42cdab0 100644
> --- a/fs/iomap/trace.h
> +++ b/fs/iomap/trace.h
> @@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
>  	{ IOMAP_REPORT,		"REPORT" }, \
>  	{ IOMAP_FAULT,		"FAULT" }, \
>  	{ IOMAP_DIRECT,		"DIRECT" }, \
> -	{ IOMAP_NOWAIT,		"NOWAIT" }
> +	{ IOMAP_NOWAIT,		"NOWAIT" }, \
> +	{ IOMAP_ATOMIC,		"ATOMIC" }
>  
>  #define IOMAP_F_FLAGS_STRINGS \
>  	{ IOMAP_F_NEW,		"NEW" }, \
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 4ad12a3c8bae..c7644bdcfca3 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -178,6 +178,7 @@ struct iomap_folio_ops {
>  #else
>  #define IOMAP_DAX		0
>  #endif /* CONFIG_FS_DAX */
> +#define IOMAP_ATOMIC		(1 << 9)

This new flag needs a documentation update.  What do you think of this?

diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
index 8e6c721d23301..279db993be7fa 100644
--- a/Documentation/filesystems/iomap/operations.rst
+++ b/Documentation/filesystems/iomap/operations.rst
@@ -513,6 +513,16 @@ IOMAP_WRITE`` with any combination of the following enhancements:
    if the mapping is unwritten and the filesystem cannot handle zeroing
    the unaligned regions without exposing stale contents.
 
+ * ``IOMAP_ATOMIC``: This write must be persisted in its entirety or
+   not at all.
+   The write must not be split into multiple I/O requests.
+   The file range to write must be aligned to satisfy the requirements
+   of both the filesystem and the underlying block device's atomic
+   commit capabilities.
+   If filesystem metadata updates are required (e.g. unwritten extent
+   conversion or copy on write), all updates for the entire file range
+   must be committed atomically as well.
+
 Callers commonly hold ``i_rwsem`` in shared or exclusive mode before
 calling this function.
 
--D

>  
>  struct iomap_ops {
>  	/*
> -- 
> 2.31.1
> 
>
John Garry Oct. 1, 2024, 8:05 a.m. UTC | #2
> 
> This new flag needs a documentation update.  What do you think of this?
> 
> diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
> index 8e6c721d23301..279db993be7fa 100644
> --- a/Documentation/filesystems/iomap/operations.rst
> +++ b/Documentation/filesystems/iomap/operations.rst
> @@ -513,6 +513,16 @@ IOMAP_WRITE`` with any combination of the following enhancements:
>      if the mapping is unwritten and the filesystem cannot handle zeroing
>      the unaligned regions without exposing stale contents.
>   
> + * ``IOMAP_ATOMIC``: This write must be persisted in its entirety or
> +   not at all.
> +   The write must not be split into multiple I/O requests.
> +   The file range to write must be aligned to satisfy the requirements
> +   of both the filesystem and the underlying block device's atomic
> +   commit capabilities.
> +   If filesystem metadata updates are required (e.g. unwritten extent
> +   conversion or copy on write), all updates for the entire file range
> +   must be committed atomically as well.
> +
>   Callers commonly hold ``i_rwsem`` in shared or exclusive mode before
>   calling this function.
>   

Sure, but I would make a couple of tweaks to the beginning:

  * ``IOMAP_ATOMIC``: This write is to be be issued with torn-write
    protection. Only a single bio can be created for the write, and the
    bio must not be split into multiple I/O requests, i.e. flag
    REQ_ATOMIC must be set.
    The file range to write must be aligned to satisfy the requirements
    of both the filesystem and the underlying block device's atomic
    commit capabilities.
    If filesystem metadata updates are required (e.g. unwritten extent
    conversion or copy on write), all updates for the entire file range
    must be committed atomically as well.

ok?

Thanks,
John
Darrick J. Wong Oct. 1, 2024, 2:37 p.m. UTC | #3
On Tue, Oct 01, 2024 at 09:05:17AM +0100, John Garry wrote:
> 
> > 
> > This new flag needs a documentation update.  What do you think of this?
> > 
> > diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
> > index 8e6c721d23301..279db993be7fa 100644
> > --- a/Documentation/filesystems/iomap/operations.rst
> > +++ b/Documentation/filesystems/iomap/operations.rst
> > @@ -513,6 +513,16 @@ IOMAP_WRITE`` with any combination of the following enhancements:
> >      if the mapping is unwritten and the filesystem cannot handle zeroing
> >      the unaligned regions without exposing stale contents.
> > + * ``IOMAP_ATOMIC``: This write must be persisted in its entirety or
> > +   not at all.
> > +   The write must not be split into multiple I/O requests.
> > +   The file range to write must be aligned to satisfy the requirements
> > +   of both the filesystem and the underlying block device's atomic
> > +   commit capabilities.
> > +   If filesystem metadata updates are required (e.g. unwritten extent
> > +   conversion or copy on write), all updates for the entire file range
> > +   must be committed atomically as well.
> > +
> >   Callers commonly hold ``i_rwsem`` in shared or exclusive mode before
> >   calling this function.
> 
> Sure, but I would make a couple of tweaks to the beginning:
> 
>  * ``IOMAP_ATOMIC``: This write is to be be issued with torn-write
>    protection. Only a single bio can be created for the write, and the
>    bio must not be split into multiple I/O requests, i.e. flag
>    REQ_ATOMIC must be set.
>    The file range to write must be aligned to satisfy the requirements
>    of both the filesystem and the underlying block device's atomic
>    commit capabilities.
>    If filesystem metadata updates are required (e.g. unwritten extent
>    conversion or copy on write), all updates for the entire file range
>    must be committed atomically as well.
> 
> ok?

Yep, sounds good.

--D

> Thanks,
> John
> 
> 
>
diff mbox series

Patch

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index f637aa0706a3..9401c05cd2c0 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -271,7 +271,7 @@  static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
  * clearing the WRITE_THROUGH flag in the dio request.
  */
 static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
-		const struct iomap *iomap, bool use_fua)
+		const struct iomap *iomap, bool use_fua, bool atomic)
 {
 	blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
 
@@ -283,6 +283,8 @@  static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
 		opflags |= REQ_FUA;
 	else
 		dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
+	if (atomic)
+		opflags |= REQ_ATOMIC;
 
 	return opflags;
 }
@@ -293,7 +295,8 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 	const struct iomap *iomap = &iter->iomap;
 	struct inode *inode = iter->inode;
 	unsigned int fs_block_size = i_blocksize(inode), pad;
-	loff_t length = iomap_length(iter);
+	const loff_t length = iomap_length(iter);
+	bool atomic = iter->flags & IOMAP_ATOMIC;
 	loff_t pos = iter->pos;
 	blk_opf_t bio_opf;
 	struct bio *bio;
@@ -303,6 +306,9 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 	size_t copied = 0;
 	size_t orig_count;
 
+	if (atomic && (length != fs_block_size))
+		return -EINVAL;
+
 	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
 	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
 		return -EINVAL;
@@ -382,7 +388,7 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 	 * can set up the page vector appropriately for a ZONE_APPEND
 	 * operation.
 	 */
-	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
+	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
 
 	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
 	do {
@@ -415,6 +421,17 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 		}
 
 		n = bio->bi_iter.bi_size;
+		if (atomic && n != length) {
+			/*
+			 * This bio should have covered the complete length,
+			 * which it doesn't, so error. We may need to zero out
+			 * the tail (complete FS block), similar to when
+			 * bio_iov_iter_get_pages() returns an error, above.
+			 */
+			ret = -EINVAL;
+			bio_put(bio);
+			goto zero_tail;
+		}
 		if (dio->flags & IOMAP_DIO_WRITE) {
 			task_io_account_write(n);
 		} else {
@@ -598,6 +615,9 @@  __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (iocb->ki_flags & IOCB_NOWAIT)
 		iomi.flags |= IOMAP_NOWAIT;
 
+	if (iocb->ki_flags & IOCB_ATOMIC)
+		iomi.flags |= IOMAP_ATOMIC;
+
 	if (iov_iter_rw(iter) == READ) {
 		/* reads can always complete inline */
 		dio->flags |= IOMAP_DIO_INLINE_COMP;
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 0a991c4ce87d..4118a42cdab0 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -98,7 +98,8 @@  DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
 	{ IOMAP_REPORT,		"REPORT" }, \
 	{ IOMAP_FAULT,		"FAULT" }, \
 	{ IOMAP_DIRECT,		"DIRECT" }, \
-	{ IOMAP_NOWAIT,		"NOWAIT" }
+	{ IOMAP_NOWAIT,		"NOWAIT" }, \
+	{ IOMAP_ATOMIC,		"ATOMIC" }
 
 #define IOMAP_F_FLAGS_STRINGS \
 	{ IOMAP_F_NEW,		"NEW" }, \
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 4ad12a3c8bae..c7644bdcfca3 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -178,6 +178,7 @@  struct iomap_folio_ops {
 #else
 #define IOMAP_DAX		0
 #endif /* CONFIG_FS_DAX */
+#define IOMAP_ATOMIC		(1 << 9)
 
 struct iomap_ops {
 	/*