diff mbox series

[v3,14/21] iomap: Sub-extent zeroing

Message ID 20240429174746.2132161-15-john.g.garry@oracle.com (mailing list archive)
State Accepted, archived
Headers show
Series block atomic writes for XFS | expand

Commit Message

John Garry April 29, 2024, 5:47 p.m. UTC
For FS_XFLAG_FORCEALIGN support, we want to treat any sub-extent IO like
sub-fsblock DIO, in that we will zero the sub-extent when the mapping is
unwritten.

This will be important for atomic writes support, in that atomically
writing over a partially written extent would mean that we would need to
do the unwritten extent conversion write separately, and the write could
no longer be atomic.

It is the task of the FS to set iomap.extent_size per iter to indicate
sub-extent zeroing required.

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/iomap/direct-io.c  | 17 +++++++++++------
 include/linux/iomap.h |  1 +
 2 files changed, 12 insertions(+), 6 deletions(-)

Comments

Dave Chinner May 1, 2024, 1:07 a.m. UTC | #1
On Mon, Apr 29, 2024 at 05:47:39PM +0000, John Garry wrote:
> For FS_XFLAG_FORCEALIGN support, we want to treat any sub-extent IO like
> sub-fsblock DIO, in that we will zero the sub-extent when the mapping is
> unwritten.
> 
> This will be important for atomic writes support, in that atomically
> writing over a partially written extent would mean that we would need to
> do the unwritten extent conversion write separately, and the write could
> no longer be atomic.
> 
> It is the task of the FS to set iomap.extent_size per iter to indicate
> sub-extent zeroing required.
> 
> Signed-off-by: John Garry <john.g.garry@oracle.com>

Shouldn't this be done before the XFS feature is enabled in the
series?

> ---
>  fs/iomap/direct-io.c  | 17 +++++++++++------
>  include/linux/iomap.h |  1 +
>  2 files changed, 12 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index f3b43d223a46..a3ed7cfa95bc 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -277,7 +277,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  {
>  	const struct iomap *iomap = &iter->iomap;
>  	struct inode *inode = iter->inode;
> -	unsigned int fs_block_size = i_blocksize(inode), pad;
> +	unsigned int zeroing_size, pad;
>  	loff_t length = iomap_length(iter);
>  	loff_t pos = iter->pos;
>  	blk_opf_t bio_opf;
> @@ -288,6 +288,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  	size_t copied = 0;
>  	size_t orig_count;
>  
> +	if (iomap->extent_size)
> +		zeroing_size = iomap->extent_size;
> +	else
> +		zeroing_size = i_blocksize(inode);

Oh, the dissonance!

iomap->extent_size isn't an extent size at all.

The size of the extent the iomap returns is iomap->length. This new
variable is the IO specific "block size" that should be assumed by
the dio code to determine if padding should be done.

IOWs, I think we should add an "io_block_size" field to the iomap,
and every filesystem that supports iomap should set it to the
filesystem block size (i_blocksize(inode)). Then the changes to the
iomap code end up just being:


-	unsigned int fs_block_size = i_blocksize(inode), pad;
+	unsigned int fs_block_size = iomap->io_block_size, pad;

And the patch that introduces that infrastructure change will also
change all the filesystem implementations to unconditionally set
iomap->io_block_size to i_blocksize().

Then, in a separate patch, you can add XFS support for large IO
block sizes when we have either a large rtextsize or extent size
hints set.

> +
>  	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
>  	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
>  		return -EINVAL;
> @@ -354,8 +359,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  		dio->iocb->ki_flags &= ~IOCB_HIPRI;
>  
>  	if (need_zeroout) {
> -		/* zero out from the start of the block to the write offset */
> -		pad = pos & (fs_block_size - 1);
> +		/* zero out from the start of the region to the write offset */
> +		pad = pos & (zeroing_size - 1);
>  		if (pad)
>  			iomap_dio_zero(iter, dio, pos - pad, pad);
>  	}
> @@ -428,10 +433,10 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  zero_tail:
>  	if (need_zeroout ||
>  	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
> -		/* zero out from the end of the write to the end of the block */
> -		pad = pos & (fs_block_size - 1);
> +		/* zero out from the end of the write to the end of the region */
> +		pad = pos & (zeroing_size - 1);
>  		if (pad)
> -			iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
> +			iomap_dio_zero(iter, dio, pos, zeroing_size - pad);
>  	}
>  out:
>  	/* Undo iter limitation to current extent */
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 6fc1c858013d..42623b1cdc04 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -97,6 +97,7 @@ struct iomap {
>  	u64			length;	/* length of mapping, bytes */
>  	u16			type;	/* type of mapping */
>  	u16			flags;	/* flags for mapping */
> +	unsigned int		extent_size;

This needs a descriptive comment. At minimum, it should tell the
reader what units are used for the variable.  If it is bytes, then
it needs to be a u64, because XFS can have extent size hints well
beyond 2^32 bytes in length.

-Dave.
John Garry May 1, 2024, 10:23 a.m. UTC | #2
On 01/05/2024 02:07, Dave Chinner wrote:
> On Mon, Apr 29, 2024 at 05:47:39PM +0000, John Garry wrote:
>> For FS_XFLAG_FORCEALIGN support, we want to treat any sub-extent IO like
>> sub-fsblock DIO, in that we will zero the sub-extent when the mapping is
>> unwritten.
>>
>> This will be important for atomic writes support, in that atomically
>> writing over a partially written extent would mean that we would need to
>> do the unwritten extent conversion write separately, and the write could
>> no longer be atomic.
>>
>> It is the task of the FS to set iomap.extent_size per iter to indicate
>> sub-extent zeroing required.
>>
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
> 
> Shouldn't this be done before the XFS feature is enabled in the
> series?

Well, it is done before XFS iomap zeroing support patch. But I can move 
this patch to the very beginning of the series.

> 
>> ---
>>   fs/iomap/direct-io.c  | 17 +++++++++++------
>>   include/linux/iomap.h |  1 +
>>   2 files changed, 12 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
>> index f3b43d223a46..a3ed7cfa95bc 100644
>> --- a/fs/iomap/direct-io.c
>> +++ b/fs/iomap/direct-io.c
>> @@ -277,7 +277,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   {
>>   	const struct iomap *iomap = &iter->iomap;
>>   	struct inode *inode = iter->inode;
>> -	unsigned int fs_block_size = i_blocksize(inode), pad;
>> +	unsigned int zeroing_size, pad;
>>   	loff_t length = iomap_length(iter);
>>   	loff_t pos = iter->pos;
>>   	blk_opf_t bio_opf;
>> @@ -288,6 +288,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   	size_t copied = 0;
>>   	size_t orig_count;
>>   
>> +	if (iomap->extent_size)
>> +		zeroing_size = iomap->extent_size;
>> +	else
>> +		zeroing_size = i_blocksize(inode);
> 
> Oh, the dissonance!
> 
> iomap->extent_size isn't an extent size at all.

Right, it's a poorly chosen name

> 
> The size of the extent the iomap returns is iomap->length. This new
> variable is the IO specific "block size" that should be assumed by
> the dio code to determine if padding should be done.
> 
> IOWs, I think we should add an "io_block_size" field to the iomap,
> and every filesystem that supports iomap should set it to the
> filesystem block size (i_blocksize(inode)). Then the changes to the
> iomap code end up just being:
> 
> 
> -	unsigned int fs_block_size = i_blocksize(inode), pad;
> +	unsigned int fs_block_size = iomap->io_block_size, pad;
> 
> And the patch that introduces that infrastructure change will also
> change all the filesystem implementations to unconditionally set
> iomap->io_block_size to i_blocksize().

ok

> 
> Then, in a separate patch, you can add XFS support for large IO
> block sizes when we have either a large rtextsize or extent size
> hints set.

I hadn't been considering large rtextsize for this. I suppose that it 
could be added.

> 
>> +
>>   	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
>>   	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
>>   		return -EINVAL;
>> @@ -354,8 +359,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   		dio->iocb->ki_flags &= ~IOCB_HIPRI;
>>   
>>   	if (need_zeroout) {
>> -		/* zero out from the start of the block to the write offset */
>> -		pad = pos & (fs_block_size - 1);
>> +		/* zero out from the start of the region to the write offset */
>> +		pad = pos & (zeroing_size - 1);
>>   		if (pad)
>>   			iomap_dio_zero(iter, dio, pos - pad, pad);
>>   	}
>> @@ -428,10 +433,10 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   zero_tail:
>>   	if (need_zeroout ||
>>   	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
>> -		/* zero out from the end of the write to the end of the block */
>> -		pad = pos & (fs_block_size - 1);
>> +		/* zero out from the end of the write to the end of the region */
>> +		pad = pos & (zeroing_size - 1);
>>   		if (pad)
>> -			iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
>> +			iomap_dio_zero(iter, dio, pos, zeroing_size - pad);
>>   	}
>>   out:
>>   	/* Undo iter limitation to current extent */
>> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
>> index 6fc1c858013d..42623b1cdc04 100644
>> --- a/include/linux/iomap.h
>> +++ b/include/linux/iomap.h
>> @@ -97,6 +97,7 @@ struct iomap {
>>   	u64			length;	/* length of mapping, bytes */
>>   	u16			type;	/* type of mapping */
>>   	u16			flags;	/* flags for mapping */
>> +	unsigned int		extent_size;
> 
> This needs a descriptive comment. At minimum, it should tell the
> reader what units are used for the variable.  If it is bytes, then
> it needs to be a u64, because XFS can have extent size hints well
> beyond 2^32 bytes in length.
> 

ok

Thanks,
John
John Garry May 30, 2024, 10:40 a.m. UTC | #3
On 01/05/2024 02:07, Dave Chinner wrote:
>>   	blk_opf_t bio_opf;
>> @@ -288,6 +288,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   	size_t copied = 0;
>>   	size_t orig_count;
>>   
>> +	if (iomap->extent_size)
>> +		zeroing_size = iomap->extent_size;
>> +	else
>> +		zeroing_size = i_blocksize(inode);
> Oh, the dissonance!
> 
> iomap->extent_size isn't an extent size at all.
> 
> The size of the extent the iomap returns is iomap->length. This new
> variable is the IO specific "block size" that should be assumed by
> the dio code to determine if padding should be done.
> 
> IOWs, I think we should add an "io_block_size" field to the iomap,
> and every filesystem that supports iomap should set it to the
> filesystem block size (i_blocksize(inode)). Then the changes to the
> iomap code end up just being:
> 
> 
> -	unsigned int fs_block_size = i_blocksize(inode), pad;
> +	unsigned int fs_block_size = iomap->io_block_size, pad;
> 
> And the patch that introduces that infrastructure change will also
> change all the filesystem implementations to unconditionally set
> iomap->io_block_size to i_blocksize().

JFYI, this is how that change looks:

----8<----

Subject: [PATCH] iomap: Allow filesystens set sub-fs block zeroing size

Allow filesystens to set the sub-fs block zero size, as in future we will
want to extend this feature to support zeroing of block sizes of larger
than the inode block size.

Signed-off-by: John Garry <john.g.garry@oracle.com>

diff --git a/block/fops.c b/block/fops.c
index 9d6d86ebefb9..020443078630 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -402,6 +402,7 @@ static int blkdev_iomap_begin(struct inode *inode, 
loff_t offset, loff_t length,
  	iomap->addr = iomap->offset;
  	iomap->length = isize - iomap->offset;
  	iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */
+	iomap->io_block_size = i_blocksize(inode);
  	return 0;
  }

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 753db965f7c0..665811b1578b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7740,6 +7740,7 @@ static int btrfs_dio_iomap_begin(struct inode 
*inode, loff_t start,
  	iomap->offset = start;
  	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
  	iomap->length = len;
+	iomap->io_block_size = i_blocksize(inode);
  	free_extent_map(em);

  	return 0;
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 8be60797ea2f..ea9d2f3eadb3 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -305,6 +305,7 @@ static int erofs_iomap_begin(struct inode *inode, 
loff_t offset, loff_t length,
  		if (flags & IOMAP_DAX)
  			iomap->addr += mdev.m_dax_part_off;
  	}
+	iomap->io_block_size = i_blocksize(inode);
  	return 0;
  }

diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 9b248ee5fef2..6ee89f6a078c 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -749,6 +749,7 @@ static int z_erofs_iomap_begin_report(struct inode 
*inode, loff_t offset,
  		if (iomap->offset >= inode->i_size)
  			iomap->length = length + offset - map.m_la;
  	}
+	iomap->io_block_size = i_blocksize(inode);
  	iomap->flags = 0;
  	return 0;
  }
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 0caa1650cee8..7a5539a52844 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -862,6 +862,7 @@ static int ext2_iomap_begin(struct inode *inode, 
loff_t offset, loff_t length,
  		iomap->length = (u64)ret << blkbits;
  		iomap->flags |= IOMAP_F_MERGED;
  	}
+	iomap->io_block_size = i_blocksize(inode);

  	if (new)
  		iomap->flags |= IOMAP_F_NEW;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e067f2dd0335..ce3269874fde 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4933,6 +4933,7 @@ static int ext4_iomap_xattr_fiemap(struct inode 
*inode, struct iomap *iomap)
  	iomap->length = length;
  	iomap->type = iomap_type;
  	iomap->flags = 0;
+	iomap->io_block_size = i_blocksize(inode);
  out:
  	return error;
  }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4bae9ccf5fe0..3ec82e4d71c4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3235,6 +3235,7 @@ static void ext4_set_iomap(struct inode *inode, 
struct iomap *iomap,
  		iomap->bdev = inode->i_sb->s_bdev;
  	iomap->offset = (u64) map->m_lblk << blkbits;
  	iomap->length = (u64) map->m_len << blkbits;
+	iomap->io_block_size = i_blocksize(inode);

  	if ((map->m_flags & EXT4_MAP_MAPPED) &&
  	    !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index b9b0debc6b3d..6c12641b9a7b 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -4233,6 +4233,7 @@ static int f2fs_iomap_begin(struct inode *inode, 
loff_t offset, loff_t length,
  		}
  		iomap->addr = IOMAP_NULL_ADDR;
  	}
+	iomap->io_block_size = i_blocksize(inode);

  	if (map.m_flags & F2FS_MAP_NEW)
  		iomap->flags |= IOMAP_F_NEW;
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 12ef91d170bb..68ddc74cb31e 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -577,6 +577,7 @@ static int fuse_iomap_begin(struct inode *inode, 
loff_t pos, loff_t length,
  	iomap->flags = 0;
  	iomap->bdev = NULL;
  	iomap->dax_dev = fc->dax->dev;
+	iomap->io_block_size = i_blocksize(inode);

  	/*
  	 * Both read/write and mmap path can race here. So we need something
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 1795c4e8dbf6..8d2de42b1da9 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -927,6 +927,7 @@ static int __gfs2_iomap_get(struct inode *inode, 
loff_t pos, loff_t length,

  out:
  	iomap->bdev = inode->i_sb->s_bdev;
+	iomap->io_block_size = i_blocksize(inode);
  unlock:
  	up_read(&ip->i_rw_mutex);
  	return ret;
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 1bb8d97cd9ae..5d2718faf520 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -149,6 +149,7 @@ static int hpfs_iomap_begin(struct inode *inode, 
loff_t offset, loff_t length,
  		iomap->addr = IOMAP_NULL_ADDR;
  		iomap->length = 1 << blkbits;
  	}
+	iomap->io_block_size = i_blocksize(inode);

  	hpfs_unlock(sb);
  	return 0;
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index f3b43d223a46..1e6eb59cac6c 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -277,7 +277,7 @@ static loff_t iomap_dio_bio_iter(const struct 
iomap_iter *iter,
  {
  	const struct iomap *iomap = &iter->iomap;
  	struct inode *inode = iter->inode;
-	unsigned int fs_block_size = i_blocksize(inode), pad;
+	u64 io_block_size = iomap->io_block_size;
  	loff_t length = iomap_length(iter);
  	loff_t pos = iter->pos;
  	blk_opf_t bio_opf;
@@ -287,6 +287,7 @@ static loff_t iomap_dio_bio_iter(const struct 
iomap_iter *iter,
  	int nr_pages, ret = 0;
  	size_t copied = 0;
  	size_t orig_count;
+	unsigned int pad;

  	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
  	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
@@ -355,7 +356,7 @@ static loff_t iomap_dio_bio_iter(const struct 
iomap_iter *iter,

  	if (need_zeroout) {
  		/* zero out from the start of the block to the write offset */
-		pad = pos & (fs_block_size - 1);
+		pad = pos & (io_block_size - 1);
  		if (pad)
  			iomap_dio_zero(iter, dio, pos - pad, pad);
  	}
@@ -429,9 +430,9 @@ static loff_t iomap_dio_bio_iter(const struct 
iomap_iter *iter,
  	if (need_zeroout ||
  	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
  		/* zero out from the end of the write to the end of the block */
-		pad = pos & (fs_block_size - 1);
+		pad = pos & (io_block_size - 1);
  		if (pad)
-			iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
+			iomap_dio_zero(iter, dio, pos, io_block_size - pad);
  	}
  out:
  	/* Undo iter limitation to current extent */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 378342673925..ecb4cae88248 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -127,6 +127,7 @@ xfs_bmbt_to_iomap(
  	}
  	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
  	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
+	iomap->io_block_size = i_blocksize(VFS_I(ip));
  	if (mapping_flags & IOMAP_DAX)
  		iomap->dax_dev = target->bt_daxdev;
  	else
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 3b103715acc9..bf2cc4bee309 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -50,6 +50,7 @@ static int zonefs_read_iomap_begin(struct inode 
*inode, loff_t offset,
  		iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
  		iomap->length = isize - iomap->offset;
  	}
+	iomap->io_block_size = i_blocksize(inode);
  	mutex_unlock(&zi->i_truncate_mutex);

  	trace_zonefs_iomap_begin(inode, iomap);
@@ -99,6 +100,7 @@ static int zonefs_write_iomap_begin(struct inode 
*inode, loff_t offset,
  		iomap->type = IOMAP_MAPPED;
  		iomap->length = isize - iomap->offset;
  	}
+	iomap->io_block_size = i_blocksize(inode);
  	mutex_unlock(&zi->i_truncate_mutex);

  	trace_zonefs_iomap_begin(inode, iomap);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 6fc1c858013d..c6ae6fdcec00 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -103,6 +103,7 @@ struct iomap {
  	void			*private; /* filesystem private */
  	const struct iomap_folio_ops *folio_ops;
  	u64			validity_cookie; /* used with .iomap_valid() */
+	u64			io_block_size; /* sub-FS block zeroing size  */
  };

  static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos)


---->8----

That's a lot changes... in addition, if rtextsize is to be considered in 
setting io_block_size, what about ext4 bigalloc and other similar features?

> 
> Then, in a separate patch, you can add XFS support for large IO
> block sizes when we have either a large rtextsize or extent size
> hints set.



> 
>> +
>>   	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
>>   	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
Long Li June 11, 2024, 3:10 a.m. UTC | #4
On Mon, Apr 29, 2024 at 05:47:39PM +0000, John Garry wrote:
> For FS_XFLAG_FORCEALIGN support, we want to treat any sub-extent IO like
> sub-fsblock DIO, in that we will zero the sub-extent when the mapping is
> unwritten.
> 
> This will be important for atomic writes support, in that atomically
> writing over a partially written extent would mean that we would need to
> do the unwritten extent conversion write separately, and the write could
> no longer be atomic.
> 
> It is the task of the FS to set iomap.extent_size per iter to indicate
> sub-extent zeroing required.
> 
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/iomap/direct-io.c  | 17 +++++++++++------
>  include/linux/iomap.h |  1 +
>  2 files changed, 12 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index f3b43d223a46..a3ed7cfa95bc 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -277,7 +277,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  {
>  	const struct iomap *iomap = &iter->iomap;
>  	struct inode *inode = iter->inode;
> -	unsigned int fs_block_size = i_blocksize(inode), pad;
> +	unsigned int zeroing_size, pad;
>  	loff_t length = iomap_length(iter);
>  	loff_t pos = iter->pos;
>  	blk_opf_t bio_opf;
> @@ -288,6 +288,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  	size_t copied = 0;
>  	size_t orig_count;
>  
> +	if (iomap->extent_size)
> +		zeroing_size = iomap->extent_size;
> +	else
> +		zeroing_size = i_blocksize(inode);
> +
>  	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
>  	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
>  		return -EINVAL;
> @@ -354,8 +359,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  		dio->iocb->ki_flags &= ~IOCB_HIPRI;
>  
>  	if (need_zeroout) {
> -		/* zero out from the start of the block to the write offset */
> -		pad = pos & (fs_block_size - 1);
> +		/* zero out from the start of the region to the write offset */
> +		pad = pos & (zeroing_size - 1);
>  		if (pad)
>  			iomap_dio_zero(iter, dio, pos - pad, pad);
 
Hi, John

I've been testing and using your atomic write patch series recently. I noticed
that if zeroing_size is larger than a single page, the length passed to
iomap_dio_zero() could also be larger than a page size. This seems incorrect
because iomap_dio_zero() utilizes ZERO_PAGE(0), which is only a single page
in size.

Thanks,
Long Li
John Garry June 11, 2024, 7:29 a.m. UTC | #5
On 11/06/2024 04:10, Long Li wrote:
>>   	if (need_zeroout) {
>> -		/* zero out from the start of the block to the write offset */
>> -		pad = pos & (fs_block_size - 1);
>> +		/* zero out from the start of the region to the write offset */
>> +		pad = pos & (zeroing_size - 1);
>>   		if (pad)
>>   			iomap_dio_zero(iter, dio, pos - pad, pad);
>   
> Hi, John
> 
> I've been testing and using your atomic write patch series recently. I noticed
> that if zeroing_size is larger than a single page, the length passed to
> iomap_dio_zero() could also be larger than a page size. This seems incorrect
> because iomap_dio_zero() utilizes ZERO_PAGE(0), which is only a single page
> in size.

ok, thanks for the notice.

So 
https://lore.kernel.org/linux-xfs/20240607145902.1137853-1-kernel@pankajraghav.com/T/#m7ba4ed4f0f0f48be99042703c10b42b72c9fe37c 
is changing that same function increase the zero range past PAGE_SIZE. 
I'll just need to figure out how to make it support an arbitrary larger 
size.

Thanks,
John
John Garry July 26, 2024, 2:29 p.m. UTC | #6
On 01/05/2024 02:07, Dave Chinner wrote:

[trim list a bit]

> On Mon, Apr 29, 2024 at 05:47:39PM +0000, John Garry wrote:
>> For FS_XFLAG_FORCEALIGN support, we want to treat any sub-extent IO like
>> sub-fsblock DIO, in that we will zero the sub-extent when the mapping is
>> unwritten.
>>
>> This will be important for atomic writes support, in that atomically
>> writing over a partially written extent would mean that we would need to
>> do the unwritten extent conversion write separately, and the write could
>> no longer be atomic.

I have been considering another approach to solve this problem.

In this patch - as you know - we zero unwritten parts of a newly 
allocated extent. This is so that when we later issue an atomic write, 
we would not have the problem of unwritten extents and how the iomap 
iterator will create multiple BIOs (which is not permitted).

How about an alternate approach like this:
- no sub-extent zeroing
- iomap iter is changed to allocate a single BIO for an atomic write in 
first iteration
- each iomap extent iteration appends data to that same BIO
- when finished iterating, we submit the BIO

Obviously that will mean many changes to the iomap bio iterator, but is 
quite self-contained.

John
Christoph Hellwig July 26, 2024, 5:13 p.m. UTC | #7
On Fri, Jul 26, 2024 at 03:29:48PM +0100, John Garry wrote:
> I have been considering another approach to solve this problem.
>
> In this patch - as you know - we zero unwritten parts of a newly allocated 
> extent. This is so that when we later issue an atomic write, we would not 
> have the problem of unwritten extents and how the iomap iterator will 
> create multiple BIOs (which is not permitted).
>
> How about an alternate approach like this:
> - no sub-extent zeroing
> - iomap iter is changed to allocate a single BIO for an atomic write in 
> first iteration
> - each iomap extent iteration appends data to that same BIO
> - when finished iterating, we submit the BIO
>
> Obviously that will mean many changes to the iomap bio iterator, but is 
> quite self-contained.

Yes, I also suggested that during the zeroing fix discussion.  There
is generally no good reason to start a new direct I/O bio if the
write is contiguous on disk and only the state of the srcmap is different.
This will also be a big win for COW / out of place overwrites.
John Garry July 29, 2024, 5:02 p.m. UTC | #8
On 26/07/2024 18:13, Christoph Hellwig wrote:
> On Fri, Jul 26, 2024 at 03:29:48PM +0100, John Garry wrote:
>> I have been considering another approach to solve this problem.
>>
>> In this patch - as you know - we zero unwritten parts of a newly allocated
>> extent. This is so that when we later issue an atomic write, we would not
>> have the problem of unwritten extents and how the iomap iterator will
>> create multiple BIOs (which is not permitted).
>>
>> How about an alternate approach like this:
>> - no sub-extent zeroing
>> - iomap iter is changed to allocate a single BIO for an atomic write in
>> first iteration
>> - each iomap extent iteration appends data to that same BIO
>> - when finished iterating, we submit the BIO
>>
>> Obviously that will mean many changes to the iomap bio iterator, but is
>> quite self-contained.
> 
> Yes, I also suggested that during the zeroing fix discussion. 

Maybe missed that. I did notice 
https://lore.kernel.org/linux-xfs/ZmwJuiMHQ8qgkJDS@infradead.org, but 
got a different impression of your idea there (to this one).

> There
> is generally no good reason to start a new direct I/O bio if the
> write is contiguous on disk and only the state of the srcmap is different.

Sure, so we don't need to worry about partially-completed writes, if 
that was a concern; and it would also mean dropping that unpleasant code 
change in xfs_iomap_write_unwritten() where the start/count fsb were 
being rounded out to the extent granule boundary.

> This will also be a big win for COW / out of place overwrites.
>
Darrick J. Wong Aug. 22, 2024, 8:35 p.m. UTC | #9
On Fri, Jul 26, 2024 at 07:13:58PM +0200, Christoph Hellwig wrote:
> On Fri, Jul 26, 2024 at 03:29:48PM +0100, John Garry wrote:
> > I have been considering another approach to solve this problem.
> >
> > In this patch - as you know - we zero unwritten parts of a newly allocated 
> > extent. This is so that when we later issue an atomic write, we would not 
> > have the problem of unwritten extents and how the iomap iterator will 
> > create multiple BIOs (which is not permitted).
> >
> > How about an alternate approach like this:
> > - no sub-extent zeroing
> > - iomap iter is changed to allocate a single BIO for an atomic write in 
> > first iteration
> > - each iomap extent iteration appends data to that same BIO
> > - when finished iterating, we submit the BIO
> >
> > Obviously that will mean many changes to the iomap bio iterator, but is 
> > quite self-contained.
> 
> Yes, I also suggested that during the zeroing fix discussion.  There
> is generally no good reason to start a new direct I/O bio if the
> write is contiguous on disk and only the state of the srcmap is different.
> This will also be a big win for COW / out of place overwrites.

But what happens if the pre-write state is:

WUWUWUWU

You can write all 8 blocks with a single bio, but the directio write
completion has to run four separate transactions to convert the four
unwritten mappings.  For COW it's ok if we crash midway through the
ioend such that a read after recovery sees this:

WWWWW0W0

because we've never guaranteed what happens if the system crashes before
fsync completes.  For untorn writes this is not allowed (even if the
actual disk contents landed successfully) because we said we wouldn't
tear the write.

--D
diff mbox series

Patch

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index f3b43d223a46..a3ed7cfa95bc 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -277,7 +277,7 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 {
 	const struct iomap *iomap = &iter->iomap;
 	struct inode *inode = iter->inode;
-	unsigned int fs_block_size = i_blocksize(inode), pad;
+	unsigned int zeroing_size, pad;
 	loff_t length = iomap_length(iter);
 	loff_t pos = iter->pos;
 	blk_opf_t bio_opf;
@@ -288,6 +288,11 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 	size_t copied = 0;
 	size_t orig_count;
 
+	if (iomap->extent_size)
+		zeroing_size = iomap->extent_size;
+	else
+		zeroing_size = i_blocksize(inode);
+
 	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
 	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
 		return -EINVAL;
@@ -354,8 +359,8 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 		dio->iocb->ki_flags &= ~IOCB_HIPRI;
 
 	if (need_zeroout) {
-		/* zero out from the start of the block to the write offset */
-		pad = pos & (fs_block_size - 1);
+		/* zero out from the start of the region to the write offset */
+		pad = pos & (zeroing_size - 1);
 		if (pad)
 			iomap_dio_zero(iter, dio, pos - pad, pad);
 	}
@@ -428,10 +433,10 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 zero_tail:
 	if (need_zeroout ||
 	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
-		/* zero out from the end of the write to the end of the block */
-		pad = pos & (fs_block_size - 1);
+		/* zero out from the end of the write to the end of the region */
+		pad = pos & (zeroing_size - 1);
 		if (pad)
-			iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
+			iomap_dio_zero(iter, dio, pos, zeroing_size - pad);
 	}
 out:
 	/* Undo iter limitation to current extent */
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 6fc1c858013d..42623b1cdc04 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -97,6 +97,7 @@  struct iomap {
 	u64			length;	/* length of mapping, bytes */
 	u16			type;	/* type of mapping */
 	u16			flags;	/* flags for mapping */
+	unsigned int		extent_size;
 	struct block_device	*bdev;	/* block device for I/O */
 	struct dax_device	*dax_dev; /* dax_dev for dax operations */
 	void			*inline_data;