diff mbox series

[v3,14/21] iomap: Sub-extent zeroing

Message ID 20240429174746.2132161-15-john.g.garry@oracle.com (mailing list archive)
State New
Headers show
Series block atomic writes for XFS | expand

Commit Message

John Garry April 29, 2024, 5:47 p.m. UTC
For FS_XFLAG_FORCEALIGN support, we want to treat any sub-extent IO like
sub-fsblock DIO, in that we will zero the sub-extent when the mapping is
unwritten.

This will be important for atomic writes support, in that atomically
writing over a partially written extent would mean that we would need to
do the unwritten extent conversion write separately, and the write could
no longer be atomic.

It is the task of the FS to set iomap.extent_size per iter to indicate
that sub-extent zeroing is required.

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/iomap/direct-io.c  | 17 +++++++++++------
 include/linux/iomap.h |  1 +
 2 files changed, 12 insertions(+), 6 deletions(-)

Comments

Dave Chinner May 1, 2024, 1:07 a.m. UTC | #1
On Mon, Apr 29, 2024 at 05:47:39PM +0000, John Garry wrote:
> For FS_XFLAG_FORCEALIGN support, we want to treat any sub-extent IO like
> sub-fsblock DIO, in that we will zero the sub-extent when the mapping is
> unwritten.
> 
> This will be important for atomic writes support, in that atomically
> writing over a partially written extent would mean that we would need to
> do the unwritten extent conversion write separately, and the write could
> no longer be atomic.
> 
> It is the task of the FS to set iomap.extent_size per iter to indicate
> sub-extent zeroing required.
> 
> Signed-off-by: John Garry <john.g.garry@oracle.com>

Shouldn't this be done before the XFS feature is enabled in the
series?

> ---
>  fs/iomap/direct-io.c  | 17 +++++++++++------
>  include/linux/iomap.h |  1 +
>  2 files changed, 12 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index f3b43d223a46..a3ed7cfa95bc 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -277,7 +277,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  {
>  	const struct iomap *iomap = &iter->iomap;
>  	struct inode *inode = iter->inode;
> -	unsigned int fs_block_size = i_blocksize(inode), pad;
> +	unsigned int zeroing_size, pad;
>  	loff_t length = iomap_length(iter);
>  	loff_t pos = iter->pos;
>  	blk_opf_t bio_opf;
> @@ -288,6 +288,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  	size_t copied = 0;
>  	size_t orig_count;
>  
> +	if (iomap->extent_size)
> +		zeroing_size = iomap->extent_size;
> +	else
> +		zeroing_size = i_blocksize(inode);

Oh, the dissonance!

iomap->extent_size isn't an extent size at all.

The size of the extent the iomap returns is iomap->length. This new
variable is the IO specific "block size" that should be assumed by
the dio code to determine if padding should be done.

IOWs, I think we should add an "io_block_size" field to the iomap,
and every filesystem that supports iomap should set it to the
filesystem block size (i_blocksize(inode)). Then the changes to the
iomap code end up just being:


-	unsigned int fs_block_size = i_blocksize(inode), pad;
+	unsigned int fs_block_size = iomap->io_block_size, pad;

And the patch that introduces that infrastructure change will also
change all the filesystem implementations to unconditionally set
iomap->io_block_size to i_blocksize().

Then, in a separate patch, you can add XFS support for large IO
block sizes when we have either a large rtextsize or extent size
hints set.

> +
>  	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
>  	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
>  		return -EINVAL;
> @@ -354,8 +359,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  		dio->iocb->ki_flags &= ~IOCB_HIPRI;
>  
>  	if (need_zeroout) {
> -		/* zero out from the start of the block to the write offset */
> -		pad = pos & (fs_block_size - 1);
> +		/* zero out from the start of the region to the write offset */
> +		pad = pos & (zeroing_size - 1);
>  		if (pad)
>  			iomap_dio_zero(iter, dio, pos - pad, pad);
>  	}
> @@ -428,10 +433,10 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  zero_tail:
>  	if (need_zeroout ||
>  	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
> -		/* zero out from the end of the write to the end of the block */
> -		pad = pos & (fs_block_size - 1);
> +		/* zero out from the end of the write to the end of the region */
> +		pad = pos & (zeroing_size - 1);
>  		if (pad)
> -			iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
> +			iomap_dio_zero(iter, dio, pos, zeroing_size - pad);
>  	}
>  out:
>  	/* Undo iter limitation to current extent */
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 6fc1c858013d..42623b1cdc04 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -97,6 +97,7 @@ struct iomap {
>  	u64			length;	/* length of mapping, bytes */
>  	u16			type;	/* type of mapping */
>  	u16			flags;	/* flags for mapping */
> +	unsigned int		extent_size;

This needs a descriptive comment. At minimum, it should tell the
reader what units are used for the variable.  If it is bytes, then
it needs to be a u64, because XFS can have extent size hints well
beyond 2^32 bytes in length.

-Dave.
John Garry May 1, 2024, 10:23 a.m. UTC | #2
On 01/05/2024 02:07, Dave Chinner wrote:
> On Mon, Apr 29, 2024 at 05:47:39PM +0000, John Garry wrote:
>> For FS_XFLAG_FORCEALIGN support, we want to treat any sub-extent IO like
>> sub-fsblock DIO, in that we will zero the sub-extent when the mapping is
>> unwritten.
>>
>> This will be important for atomic writes support, in that atomically
>> writing over a partially written extent would mean that we would need to
>> do the unwritten extent conversion write separately, and the write could
>> no longer be atomic.
>>
>> It is the task of the FS to set iomap.extent_size per iter to indicate
>> sub-extent zeroing required.
>>
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
> 
> Shouldn't this be done before the XFS feature is enabled in the
> series?

Well, it is done before the XFS iomap zeroing support patch, but I can
move this patch to the very beginning of the series.

> 
>> ---
>>   fs/iomap/direct-io.c  | 17 +++++++++++------
>>   include/linux/iomap.h |  1 +
>>   2 files changed, 12 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
>> index f3b43d223a46..a3ed7cfa95bc 100644
>> --- a/fs/iomap/direct-io.c
>> +++ b/fs/iomap/direct-io.c
>> @@ -277,7 +277,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   {
>>   	const struct iomap *iomap = &iter->iomap;
>>   	struct inode *inode = iter->inode;
>> -	unsigned int fs_block_size = i_blocksize(inode), pad;
>> +	unsigned int zeroing_size, pad;
>>   	loff_t length = iomap_length(iter);
>>   	loff_t pos = iter->pos;
>>   	blk_opf_t bio_opf;
>> @@ -288,6 +288,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   	size_t copied = 0;
>>   	size_t orig_count;
>>   
>> +	if (iomap->extent_size)
>> +		zeroing_size = iomap->extent_size;
>> +	else
>> +		zeroing_size = i_blocksize(inode);
> 
> Oh, the dissonance!
> 
> iomap->extent_size isn't an extent size at all.

Right, it's a poorly chosen name.

> 
> The size of the extent the iomap returns is iomap->length. This new
> variable is the IO specific "block size" that should be assumed by
> the dio code to determine if padding should be done.
> 
> IOWs, I think we should add an "io_block_size" field to the iomap,
> and every filesystem that supports iomap should set it to the
> filesystem block size (i_blocksize(inode)). Then the changes to the
> iomap code end up just being:
> 
> 
> -	unsigned int fs_block_size = i_blocksize(inode), pad;
> +	unsigned int fs_block_size = iomap->io_block_size, pad;
> 
> And the patch that introduces that infrastructure change will also
> change all the filesystem implementations to unconditionally set
> iomap->io_block_size to i_blocksize().

ok

> 
> Then, in a separate patch, you can add XFS support for large IO
> block sizes when we have either a large rtextsize or extent size
> hints set.

I hadn't been considering large rtextsize for this. I suppose that it 
could be added.

> 
>> +
>>   	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
>>   	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
>>   		return -EINVAL;
>> @@ -354,8 +359,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   		dio->iocb->ki_flags &= ~IOCB_HIPRI;
>>   
>>   	if (need_zeroout) {
>> -		/* zero out from the start of the block to the write offset */
>> -		pad = pos & (fs_block_size - 1);
>> +		/* zero out from the start of the region to the write offset */
>> +		pad = pos & (zeroing_size - 1);
>>   		if (pad)
>>   			iomap_dio_zero(iter, dio, pos - pad, pad);
>>   	}
>> @@ -428,10 +433,10 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   zero_tail:
>>   	if (need_zeroout ||
>>   	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
>> -		/* zero out from the end of the write to the end of the block */
>> -		pad = pos & (fs_block_size - 1);
>> +		/* zero out from the end of the write to the end of the region */
>> +		pad = pos & (zeroing_size - 1);
>>   		if (pad)
>> -			iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
>> +			iomap_dio_zero(iter, dio, pos, zeroing_size - pad);
>>   	}
>>   out:
>>   	/* Undo iter limitation to current extent */
>> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
>> index 6fc1c858013d..42623b1cdc04 100644
>> --- a/include/linux/iomap.h
>> +++ b/include/linux/iomap.h
>> @@ -97,6 +97,7 @@ struct iomap {
>>   	u64			length;	/* length of mapping, bytes */
>>   	u16			type;	/* type of mapping */
>>   	u16			flags;	/* flags for mapping */
>> +	unsigned int		extent_size;
> 
> This needs a descriptive comment. At minimum, it should tell the
> reader what units are used for the variable.  If it is bytes, then
> it needs to be a u64, because XFS can have extent size hints well
> beyond 2^32 bytes in length.
> 

ok

Thanks,
John
diff mbox series

Patch

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index f3b43d223a46..a3ed7cfa95bc 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -277,7 +277,7 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 {
 	const struct iomap *iomap = &iter->iomap;
 	struct inode *inode = iter->inode;
-	unsigned int fs_block_size = i_blocksize(inode), pad;
+	unsigned int zeroing_size, pad;
 	loff_t length = iomap_length(iter);
 	loff_t pos = iter->pos;
 	blk_opf_t bio_opf;
@@ -288,6 +288,11 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 	size_t copied = 0;
 	size_t orig_count;
 
+	if (iomap->extent_size)
+		zeroing_size = iomap->extent_size;
+	else
+		zeroing_size = i_blocksize(inode);
+
 	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
 	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
 		return -EINVAL;
@@ -354,8 +359,8 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 		dio->iocb->ki_flags &= ~IOCB_HIPRI;
 
 	if (need_zeroout) {
-		/* zero out from the start of the block to the write offset */
-		pad = pos & (fs_block_size - 1);
+		/* zero out from the start of the region to the write offset */
+		pad = pos & (zeroing_size - 1);
 		if (pad)
 			iomap_dio_zero(iter, dio, pos - pad, pad);
 	}
@@ -428,10 +433,10 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 zero_tail:
 	if (need_zeroout ||
 	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
-		/* zero out from the end of the write to the end of the block */
-		pad = pos & (fs_block_size - 1);
+		/* zero out from the end of the write to the end of the region */
+		pad = pos & (zeroing_size - 1);
 		if (pad)
-			iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
+			iomap_dio_zero(iter, dio, pos, zeroing_size - pad);
 	}
 out:
 	/* Undo iter limitation to current extent */
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 6fc1c858013d..42623b1cdc04 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -97,6 +97,7 @@  struct iomap {
 	u64			length;	/* length of mapping, bytes */
 	u16			type;	/* type of mapping */
 	u16			flags;	/* flags for mapping */
+	unsigned int		extent_size;
 	struct block_device	*bdev;	/* block device for I/O */
 	struct dax_device	*dax_dev; /* dax_dev for dax operations */
 	void			*inline_data;