diff mbox series

[v2,2/7] iomap: Add zero unwritten mappings dio support

Message ID 20241210125737.786928-3-john.g.garry@oracle.com (mailing list archive)
State Not Applicable, archived
Headers show
Series large atomic writes for xfs | expand

Commit Message

John Garry Dec. 10, 2024, 12:57 p.m. UTC
For atomic writes support, it is required to only ever submit a single bio
(for an atomic write).

Furthermore, currently the atomic write unit min and max limit is fixed at
the FS block size.

For lifting the atomic write unit max limit, it may occur that an atomic
write spans mixed unwritten and mapped extents. For this case, due to the
iterative nature of iomap, multiple bios would be produced, which is
intolerable.

Add a function to zero unwritten extents in a certain range, which may be
used to ensure that unwritten extents are zeroed prior to issuing of an
atomic write.

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/iomap/direct-io.c  | 76 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/iomap.h |  3 ++
 2 files changed, 79 insertions(+)

Comments

Darrick J. Wong Dec. 11, 2024, 11:47 p.m. UTC | #1
On Tue, Dec 10, 2024 at 12:57:32PM +0000, John Garry wrote:
> For atomic writes support, it is required to only ever submit a single bio
> (for an atomic write).
> 
> Furthermore, currently the atomic write unit min and max limit is fixed at
> the FS block size.
> 
> For lifting the atomic write unit max limit, it may occur that an atomic
> write spans mixed unwritten and mapped extents. For this case, due to the
> iterative nature of iomap, multiple bios would be produced, which is
> intolerable.
> 
> Add a function to zero unwritten extents in a certain range, which may be
> used to ensure that unwritten extents are zeroed prior to issuing of an
> atomic write.

I still dislike this.  IMO block untorn writes _is_ a niche feature for
programs that perform IO in large blocks.  Any program that wants a
general "apply all these updates or none of them" interface should use
XFS_IOC_EXCHANGE_RANGE since it has no awu_max restrictions, can handle
discontiguous update ranges, doesn't require block alignment, etc.

Instead here we are adding a bunch of complexity, and not even all that
well:

> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/iomap/direct-io.c  | 76 +++++++++++++++++++++++++++++++++++++++++++
>  include/linux/iomap.h |  3 ++
>  2 files changed, 79 insertions(+)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 23fdad16e6a8..18c888f0c11f 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -805,6 +805,82 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  }
>  EXPORT_SYMBOL_GPL(iomap_dio_rw);
>  
> +static loff_t
> +iomap_dio_zero_unwritten_iter(struct iomap_iter *iter, struct iomap_dio *dio)
> +{
> +	const struct iomap *iomap = &iter->iomap;
> +	loff_t length = iomap_length(iter);
> +	loff_t pos = iter->pos;
> +
> +	if (iomap->type == IOMAP_UNWRITTEN) {
> +		int ret;
> +
> +		dio->flags |= IOMAP_DIO_UNWRITTEN;
> +		ret = iomap_dio_zero(iter, dio, pos, length);

Shouldn't this be detecting the particular case that the mapping for the
kiocb is in mixed state and only zeroing in that case?  This just
targets every unwritten extent, even if the unwritten extent covered the
entire range that is being written.  It doesn't handle COW, it doesn't
handle holes, etc.

Also, can you make a version of blkdev_issue_zeroout that returns the
bio so the caller can issue them asynchronously instead of opencoding
the bio_alloc loop in iomap_dev_zero?

> +		if (ret)
> +			return ret;
> +	}
> +
> +	dio->size += length;
> +
> +	return length;
> +}
> +
> +ssize_t
> +iomap_dio_zero_unwritten(struct kiocb *iocb, struct iov_iter *iter,
> +		const struct iomap_ops *ops, const struct iomap_dio_ops *dops)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	struct iomap_dio *dio;
> +	ssize_t ret;
> +	struct iomap_iter iomi = {
> +		.inode		= inode,
> +		.pos		= iocb->ki_pos,
> +		.len		= iov_iter_count(iter),
> +		.flags		= IOMAP_WRITE,

IOMAP_WRITE | IOMAP_DIRECT, no?

--D

> +	};
> +
> +	dio = kzalloc(sizeof(*dio), GFP_KERNEL);
> +	if (!dio)
> +		return -ENOMEM;
> +
> +	dio->iocb = iocb;
> +	atomic_set(&dio->ref, 1);
> +	dio->i_size = i_size_read(inode);
> +	dio->dops = dops;
> +	dio->submit.waiter = current;
> +	dio->wait_for_completion = true;
> +
> +	inode_dio_begin(inode);
> +
> +	while ((ret = iomap_iter(&iomi, ops)) > 0)
> +		iomi.processed = iomap_dio_zero_unwritten_iter(&iomi, dio);
> +
> +	if (ret < 0)
> +		iomap_dio_set_error(dio, ret);
> +
> +	if (!atomic_dec_and_test(&dio->ref)) {
> +		for (;;) {
> +			set_current_state(TASK_UNINTERRUPTIBLE);
> +			if (!READ_ONCE(dio->submit.waiter))
> +				break;
> +
> +			blk_io_schedule();
> +		}
> +		__set_current_state(TASK_RUNNING);
> +	}
> +
> +	if (dops && dops->end_io)
> +		ret = dops->end_io(iocb, dio->size, ret, dio->flags);
> +
> +	kfree(dio);
> +
> +	inode_dio_end(file_inode(iocb->ki_filp));
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iomap_dio_zero_unwritten);
> +
>  static int __init iomap_dio_init(void)
>  {
>  	zero_page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 5675af6b740c..c2d44b9e446d 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -440,6 +440,9 @@ ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
>  		unsigned int dio_flags, void *private, size_t done_before);
> +ssize_t iomap_dio_zero_unwritten(struct kiocb *iocb, struct iov_iter *iter,
> +		const struct iomap_ops *ops, const struct iomap_dio_ops *dops);
> +
>  ssize_t iomap_dio_complete(struct iomap_dio *dio);
>  void iomap_dio_bio_end_io(struct bio *bio);
>  
> -- 
> 2.31.1
> 
>
John Garry Dec. 12, 2024, 10:40 a.m. UTC | #2
On 11/12/2024 23:47, Darrick J. Wong wrote:
> On Tue, Dec 10, 2024 at 12:57:32PM +0000, John Garry wrote:
>> For atomic writes support, it is required to only ever submit a single bio
>> (for an atomic write).
>>
>> Furthermore, currently the atomic write unit min and max limit is fixed at
>> the FS block size.
>>
>> For lifting the atomic write unit max limit, it may occur that an atomic
>> write spans mixed unwritten and mapped extents. For this case, due to the
>> iterative nature of iomap, multiple bios would be produced, which is
>> intolerable.
>>
>> Add a function to zero unwritten extents in a certain range, which may be
>> used to ensure that unwritten extents are zeroed prior to issuing of an
>> atomic write.
> 
> I still dislike this.  IMO block untorn writes _is_ a niche feature for
> programs that perform IO in large blocks.  Any program that wants a
> general "apply all these updates or none of them" interface should use
> XFS_IOC_EXCHANGE_RANGE since it has no awu_max restrictions, can handle
> discontiguous update ranges, doesn't require block alignment, etc.

That is not a problem which I am trying to solve. Indeed atomic writes 
are for fixed blocks of data and not atomic file updates.

However, I still think that we should be able to atomic write mixed 
extents, even though it is a pain to implement. To that end, I could be 
convinced again that we don't require it...

> 
> Instead here we are adding a bunch of complexity, and not even all that
> well:
> 
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>>   fs/iomap/direct-io.c  | 76 +++++++++++++++++++++++++++++++++++++++++++
>>   include/linux/iomap.h |  3 ++
>>   2 files changed, 79 insertions(+)
>>
>> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
>> index 23fdad16e6a8..18c888f0c11f 100644
>> --- a/fs/iomap/direct-io.c
>> +++ b/fs/iomap/direct-io.c
>> @@ -805,6 +805,82 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>>   }
>>   EXPORT_SYMBOL_GPL(iomap_dio_rw);
>>   
>> +static loff_t
>> +iomap_dio_zero_unwritten_iter(struct iomap_iter *iter, struct iomap_dio *dio)
>> +{
>> +	const struct iomap *iomap = &iter->iomap;
>> +	loff_t length = iomap_length(iter);
>> +	loff_t pos = iter->pos;
>> +
>> +	if (iomap->type == IOMAP_UNWRITTEN) {
>> +		int ret;
>> +
>> +		dio->flags |= IOMAP_DIO_UNWRITTEN;
>> +		ret = iomap_dio_zero(iter, dio, pos, length);
> 
> Shouldn't this be detecting the particular case that the mapping for the
> kiocb is in mixed state and only zeroing in that case?  This just
> targets every unwritten extent, even if the unwritten extent covered the
> entire range that is being written. 

Right, so I did touch on this in the final comment in patch 4/7 commit log.

Why I did it this way? I did not think that it made much difference, 
since this zeroing would be generally a one-off and did not merit even 
more complexity to implement.

> It doesn't handle COW, it doesn't

Do we want to atomic write COW?

> handle holes, etc.

I did test hole, and it seemed to work. However I noticed that for a 
hole region we get IOMAP_UNWRITTEN type, like:

# xfs_bmap -vvp /root/mnt/file
/root/mnt/file:
  EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL FLAGS
    0: [0..127]:        192..319          0 (192..319)         128 000000
    1: [128..255]:      hole                                   128
    2: [256..383]:      448..575          0 (448..575)         128 000000
  FLAG Values:
     0100000 Shared extent
     0010000 Unwritten preallocated extent
     0001000 Doesn't begin on stripe unit
     0000100 Doesn't end   on stripe unit
     0000010 Doesn't begin on stripe width
     0000001 Doesn't end   on stripe width
#
#

For an atomic wrote of length 65536 @ offset 65536.

Any hint on how to create a iomap->type == IOMAP_HOLE?

> 
> Also, can you make a version of blkdev_issue_zeroout that returns the
> bio so the caller can issue them asynchronously instead of opencoding
> the bio_alloc loop in iomap_dev_zero?

ok, fine

> 
>> +		if (ret)
>> +			return ret;
>> +	}
>> +
>> +	dio->size += length;
>> +
>> +	return length;
>> +}
>> +
>> +ssize_t
>> +iomap_dio_zero_unwritten(struct kiocb *iocb, struct iov_iter *iter,
>> +		const struct iomap_ops *ops, const struct iomap_dio_ops *dops)
>> +{
>> +	struct inode *inode = file_inode(iocb->ki_filp);
>> +	struct iomap_dio *dio;
>> +	ssize_t ret;
>> +	struct iomap_iter iomi = {
>> +		.inode		= inode,
>> +		.pos		= iocb->ki_pos,
>> +		.len		= iov_iter_count(iter),
>> +		.flags		= IOMAP_WRITE,
> 
> IOMAP_WRITE | IOMAP_DIRECT, no?

yes, I'll fix that.

And I should also set IOMAP_DIO_WRITE for dio->flags.

Thanks,
John
Darrick J. Wong Dec. 12, 2024, 8:40 p.m. UTC | #3
On Thu, Dec 12, 2024 at 10:40:15AM +0000, John Garry wrote:
> On 11/12/2024 23:47, Darrick J. Wong wrote:
> > On Tue, Dec 10, 2024 at 12:57:32PM +0000, John Garry wrote:
> > > For atomic writes support, it is required to only ever submit a single bio
> > > (for an atomic write).
> > > 
> > > Furthermore, currently the atomic write unit min and max limit is fixed at
> > > the FS block size.
> > > 
> > > For lifting the atomic write unit max limit, it may occur that an atomic
> > > write spans mixed unwritten and mapped extents. For this case, due to the
> > > iterative nature of iomap, multiple bios would be produced, which is
> > > intolerable.
> > > 
> > > Add a function to zero unwritten extents in a certain range, which may be
> > > used to ensure that unwritten extents are zeroed prior to issuing of an
> > > atomic write.
> > 
> > I still dislike this.  IMO block untorn writes _is_ a niche feature for
> > programs that perform IO in large blocks.  Any program that wants a
> > general "apply all these updates or none of them" interface should use
> > XFS_IOC_EXCHANGE_RANGE since it has no awu_max restrictions, can handle
> > discontiguous update ranges, doesn't require block alignment, etc.
> 
> That is not a problem which I am trying to solve. Indeed atomic writes are
> for fixed blocks of data and not atomic file updates.

Agreed.

> However, I still think that we should be able to atomic write mixed extents,
> even though it is a pain to implement. To that end, I could be convinced
> again that we don't require it...

Well... if you /did/ add a few entries to include/uapi/linux/fs.h for
ways that an untorn write can fail, then we could define the programming
interface as so:

"If you receive -EBADMAP, then call fallocate(FALLOC_FL_MAKE_OVERWRITE)
to force all the mappings to pure overwrites."

...since there have been a few people who have asked about that ability
so that they can write+fdatasync without so much overhead from file
metadata updates.

> > 
> > Instead here we are adding a bunch of complexity, and not even all that
> > well:
> > 
> > > Signed-off-by: John Garry <john.g.garry@oracle.com>
> > > ---
> > >   fs/iomap/direct-io.c  | 76 +++++++++++++++++++++++++++++++++++++++++++
> > >   include/linux/iomap.h |  3 ++
> > >   2 files changed, 79 insertions(+)
> > > 
> > > diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> > > index 23fdad16e6a8..18c888f0c11f 100644
> > > --- a/fs/iomap/direct-io.c
> > > +++ b/fs/iomap/direct-io.c
> > > @@ -805,6 +805,82 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> > >   }
> > >   EXPORT_SYMBOL_GPL(iomap_dio_rw);
> > > +static loff_t
> > > +iomap_dio_zero_unwritten_iter(struct iomap_iter *iter, struct iomap_dio *dio)
> > > +{
> > > +	const struct iomap *iomap = &iter->iomap;
> > > +	loff_t length = iomap_length(iter);
> > > +	loff_t pos = iter->pos;
> > > +
> > > +	if (iomap->type == IOMAP_UNWRITTEN) {
> > > +		int ret;
> > > +
> > > +		dio->flags |= IOMAP_DIO_UNWRITTEN;
> > > +		ret = iomap_dio_zero(iter, dio, pos, length);
> > 
> > Shouldn't this be detecting the particular case that the mapping for the
> > kiocb is in mixed state and only zeroing in that case?  This just
> > targets every unwritten extent, even if the unwritten extent covered the
> > entire range that is being written.
> 
> Right, so I did touch on this in the final comment in patch 4/7 commit log.
> 
> Why I did it this way? I did not think that it made much difference, since
> this zeroing would be generally a one-off and did not merit even more
> complexity to implement.

The trouble is, if you fallocate the whole file and then write an
aligned 64k block, this will write zeroes to the block, update the
mapping, and only then issue the untorn write.  Sure that's a one time
performance hit, but probably not a welcome one.

> > It doesn't handle COW, it doesn't
> 
> Do we want to atomic write COW?

I don't see why not -- if there's a single COW mapping for the whole
untorn write, then the data gets written to the media in an untorn
fashion, and the remap is a single transaction.

> > handle holes, etc.
> 
> I did test hole, and it seemed to work. However I noticed that for a hole
> region we get IOMAP_UNWRITTEN type, like:

Oh right, that's ->iomap_begin allocating an unwritten extent into the
hole, because you're not allowed to specify a hole for the destination
of a write.  I withdraw that part of the comment.

> # xfs_bmap -vvp /root/mnt/file
> /root/mnt/file:
>  EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL FLAGS
>    0: [0..127]:        192..319          0 (192..319)         128 000000
>    1: [128..255]:      hole                                   128
>    2: [256..383]:      448..575          0 (448..575)         128 000000
>  FLAG Values:
>     0100000 Shared extent
>     0010000 Unwritten preallocated extent
>     0001000 Doesn't begin on stripe unit
>     0000100 Doesn't end   on stripe unit
>     0000010 Doesn't begin on stripe width
>     0000001 Doesn't end   on stripe width
> #
> #
> 
> For an atomic wrote of length 65536 @ offset 65536.
> 
> Any hint on how to create a iomap->type == IOMAP_HOLE?
> 
> > Also, can you make a version of blkdev_issue_zeroout that returns the
> > bio so the caller can issue them asynchronously instead of opencoding
> > the bio_alloc loop in iomap_dev_zero?
> 
> ok, fine
> 
> > 
> > > +		if (ret)
> > > +			return ret;
> > > +	}
> > > +
> > > +	dio->size += length;
> > > +
> > > +	return length;
> > > +}
> > > +
> > > +ssize_t
> > > +iomap_dio_zero_unwritten(struct kiocb *iocb, struct iov_iter *iter,
> > > +		const struct iomap_ops *ops, const struct iomap_dio_ops *dops)
> > > +{
> > > +	struct inode *inode = file_inode(iocb->ki_filp);
> > > +	struct iomap_dio *dio;
> > > +	ssize_t ret;
> > > +	struct iomap_iter iomi = {
> > > +		.inode		= inode,
> > > +		.pos		= iocb->ki_pos,
> > > +		.len		= iov_iter_count(iter),
> > > +		.flags		= IOMAP_WRITE,
> > 
> > IOMAP_WRITE | IOMAP_DIRECT, no?
> 
> yes, I'll fix that.
> 
> And I should also set IOMAP_DIO_WRITE for dio->flags.

<nod>

--D

> Thanks,
> John
>
John Garry Dec. 13, 2024, 10:43 a.m. UTC | #4
>> However, I still think that we should be able to atomic write mixed extents,
>> even though it is a pain to implement. To that end, I could be convinced
>> again that we don't require it...
> 
> Well... if you /did/ add a few entries to include/uapi/linux/fs.h for
> ways that an untorn write can fail, then we could define the programming
> interface as so:
> 
> "If you receive -EBADMAP, then call fallocate(FALLOC_FL_MAKE_OVERWRITE)
> to force all the mappings to pure overwrites."
> 
> ...since there have been a few people who have asked about that ability
> so that they can write+fdatasync without so much overhead from file
> metadata updates.

ok, I see.

All this does seem more complicated in terms of implementation and user 
experience than what I have in this series. But if you think that there 
is value in FALLOC_FL_MAKE_OVERWRITE for other scenarios, then maybe it 
could be good, though.

> 
>>>
>>> Instead here we are adding a bunch of complexity, and not even all that
>>> well:
>>>
>>>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>>>> ---
>>>>    fs/iomap/direct-io.c  | 76 +++++++++++++++++++++++++++++++++++++++++++
>>>>    include/linux/iomap.h |  3 ++
>>>>    2 files changed, 79 insertions(+)
>>>>
>>>> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
>>>> index 23fdad16e6a8..18c888f0c11f 100644
>>>> --- a/fs/iomap/direct-io.c
>>>> +++ b/fs/iomap/direct-io.c
>>>> @@ -805,6 +805,82 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>>>>    }
>>>>    EXPORT_SYMBOL_GPL(iomap_dio_rw);
>>>> +static loff_t
>>>> +iomap_dio_zero_unwritten_iter(struct iomap_iter *iter, struct iomap_dio *dio)
>>>> +{
>>>> +	const struct iomap *iomap = &iter->iomap;
>>>> +	loff_t length = iomap_length(iter);
>>>> +	loff_t pos = iter->pos;
>>>> +
>>>> +	if (iomap->type == IOMAP_UNWRITTEN) {
>>>> +		int ret;
>>>> +
>>>> +		dio->flags |= IOMAP_DIO_UNWRITTEN;
>>>> +		ret = iomap_dio_zero(iter, dio, pos, length);
>>>
>>> Shouldn't this be detecting the particular case that the mapping for the
>>> kiocb is in mixed state and only zeroing in that case?  This just
>>> targets every unwritten extent, even if the unwritten extent covered the
>>> entire range that is being written.
>>
>> Right, so I did touch on this in the final comment in patch 4/7 commit log.
>>
>> Why I did it this way? I did not think that it made much difference, since
>> this zeroing would be generally a one-off and did not merit even more
>> complexity to implement.
> 
> The trouble is, if you fallocate the whole file and then write an
> aligned 64k block, this will write zeroes to the block, update the
> mapping, and only then issue the untorn write.  Sure that's a one time
> performance hit, but probably not a welcome one.

ok, I can try to improve on this. It might get considerably more 
complicated...

> 
>>> It doesn't handle COW, it doesn't
>>
>> Do we want to atomic write COW?
> 
> I don't see why not -- if there's a single COW mapping for the whole
> untorn write, then the data gets written to the media in an untorn
> fashion, and the remap is a single transaction.

I tested atomic write on COW and it works ok, but the behavior is odd to me.

If I attempt to atomic write a single block in shared extent, then we 
have this callchain: xfs_file_dio_write_atomic() -> 
iomap_dio_rw(IOMAP_DIO_OVERWRITE_ONLY) -> ... 
xfs_direct_write_iomap_begin() -> xfs_reflink_allocate_cow() and we 
alloc a new extent.

And so xfs_file_dio_write_atomic() -> 
iomap_dio_rw(IOMAP_DIO_OVERWRITE_ONLY) does not return -EAGAIN and we 
don't even attempt to zero.

I just wonder why IOMAP_DIO_OVERWRITE_ONLY is not honoured here, as 
xfs_reflink_allocate_cow() -> xfs_reflink_fill_cow_hole() -> 
xfs_bmap_write() can alloc new blocks.

I am not too concerned about atomic writing mixed extents which includes 
COW extents, as atomic writing mixed extents is based on "big alloc" and 
that does not enable reflink (yet).

> 
>>> handle holes, etc.
>>
>> I did test hole, and it seemed to work. However I noticed that for a hole
>> region we get IOMAP_UNWRITTEN type, like:
> 
> Oh right, that's ->iomap_begin allocating an unwritten extent into the
> hole, because you're not allowed to specify a hole for the destination
> of a write.  I withdraw that part of the comment.


> 

Thanks,
John
Christoph Hellwig Dec. 13, 2024, 2:47 p.m. UTC | #5
On Thu, Dec 12, 2024 at 12:40:07PM -0800, Darrick J. Wong wrote:
> > However, I still think that we should be able to atomic write mixed extents,
> > even though it is a pain to implement. To that end, I could be convinced
> > again that we don't require it...
> 
> Well... if you /did/ add a few entries to include/uapi/linux/fs.h for
> ways that an untorn write can fail, then we could define the programming
> interface as so:
> 
> "If you receive -EBADMAP, then call fallocate(FALLOC_FL_MAKE_OVERWRITE)
> to force all the mappings to pure overwrites."

Ewwwwwwwwwwwwwwwwwwwww.

That's not a sane API in any way.

> ...since there have been a few people who have asked about that ability
> so that they can write+fdatasync without so much overhead from file
> metadata updates.

And all of them fundamentally misunderstood file system semantics and/or
used weird bypasses that are dommed to corrupt the file system sooner
or later.
Darrick J. Wong Dec. 14, 2024, 12:56 a.m. UTC | #6
On Fri, Dec 13, 2024 at 03:47:40PM +0100, Christoph Hellwig wrote:
> On Thu, Dec 12, 2024 at 12:40:07PM -0800, Darrick J. Wong wrote:
> > > However, I still think that we should be able to atomic write mixed extents,
> > > even though it is a pain to implement. To that end, I could be convinced
> > > again that we don't require it...
> > 
> > Well... if you /did/ add a few entries to include/uapi/linux/fs.h for
> > ways that an untorn write can fail, then we could define the programming
> > interface as so:
> > 
> > "If you receive -EBADMAP, then call fallocate(FALLOC_FL_MAKE_OVERWRITE)
> > to force all the mappings to pure overwrites."
> 
> Ewwwwwwwwwwwwwwwwwwwww.
> 
> That's not a sane API in any way.

Oh I know, I'd much rather stick to the view that block untorn writes
are a means for programs that only ever do IO in large(ish) blocks to
take advantage of a hardware feature that also wants those large
blocks.  And only if the file mapping is in the correct state, and the
program is willing to *maintain* them in the correct state to get the
better performance.  I don't want xfs to grow code to write zeroes to
mapped blocks just so it can then write-untorn to the same blocks.

The gross part is that I think if you want to do untorn multi-fsblock
writes, then you need forcealign.  In turn, forcealign has to handle COW
of shared blocks.  willy and I looked through the changes I made to
support dirtying and writing back gangs of pages for rtreflink when the
rtextsize > 1, and didn't find anything insane in there.  Using that to
handle COWing forcealign file blocks should work, though things get
tricky once you add atomic untorn writes because we can't split bios.

Everything else I think should use exchange-range because it has so many
fewer limitations.

--D

> > ...since there have been a few people who have asked about that ability
> > so that they can write+fdatasync without so much overhead from file
> > metadata updates.
> 
> And all of them fundamentally misunderstood file system semantics and/or
> used weird bypasses that are dommed to corrupt the file system sooner
> or later.
Christoph Hellwig Dec. 17, 2024, 7:08 a.m. UTC | #7
On Fri, Dec 13, 2024 at 04:56:38PM -0800, Darrick J. Wong wrote:
> > > "If you receive -EBADMAP, then call fallocate(FALLOC_FL_MAKE_OVERWRITE)
> > > to force all the mappings to pure overwrites."
> > 
> > Ewwwwwwwwwwwwwwwwwwwww.
> > 
> > That's not a sane API in any way.
> 
> Oh I know, I'd much rather stick to the view that block untorn writes
> are a means for programs that only ever do IO in large(ish) blocks to
> take advantage of a hardware feature that also wants those large
> blocks.

I (vaguely) agree ith that.

> And only if the file mapping is in the correct state, and the
> program is willing to *maintain* them in the correct state to get the
> better performance.

I kinda agree with that, but the maintain is a bit hard as general
rule of thumb as file mappings can change behind the applications
back.  So building interfaces around the concept that there are
entirely stable mappings seems like a bad idea.

> I don't want xfs to grow code to write zeroes to
> mapped blocks just so it can then write-untorn to the same blocks.

Agreed.
John Garry Dec. 18, 2024, 11:15 a.m. UTC | #8
On 17/12/2024 07:08, Christoph Hellwig wrote:
> On Fri, Dec 13, 2024 at 04:56:38PM -0800, Darrick J. Wong wrote:
>>>> "If you receive -EBADMAP, then call fallocate(FALLOC_FL_MAKE_OVERWRITE)
>>>> to force all the mappings to pure overwrites."
>>>
>>> Ewwwwwwwwwwwwwwwwwwwww.
>>>
>>> That's not a sane API in any way.
>>
>> Oh I know, I'd much rather stick to the view that block untorn writes
>> are a means for programs that only ever do IO in large(ish) blocks to
>> take advantage of a hardware feature that also wants those large
>> blocks.
> 
> I (vaguely) agree ith that.
> 
>> And only if the file mapping is in the correct state, and the
>> program is willing to *maintain* them in the correct state to get the
>> better performance.
> 
> I kinda agree with that, but the maintain is a bit hard as general
> rule of thumb as file mappings can change behind the applications
> back.  So building interfaces around the concept that there are
> entirely stable mappings seems like a bad idea.

I tend to agree.

> 
>> I don't want xfs to grow code to write zeroes to
>> mapped blocks just so it can then write-untorn to the same blocks.
> 
> Agreed.
> 

So if we want to allow large writes over mixed extents, how to handle?

Note that some time ago we also discussed that we don't want to have a 
single bio covering mixed extents as we cannot atomically convert all 
unwritten extents to mapped.

Thanks,
John
Darrick J. Wong Jan. 8, 2025, 1:26 a.m. UTC | #9
On Wed, Dec 18, 2024 at 11:15:42AM +0000, John Garry wrote:
> On 17/12/2024 07:08, Christoph Hellwig wrote:
> > On Fri, Dec 13, 2024 at 04:56:38PM -0800, Darrick J. Wong wrote:
> > > > > "If you receive -EBADMAP, then call fallocate(FALLOC_FL_MAKE_OVERWRITE)
> > > > > to force all the mappings to pure overwrites."
> > > > 
> > > > Ewwwwwwwwwwwwwwwwwwwww.
> > > > 
> > > > That's not a sane API in any way.
> > > 
> > > Oh I know, I'd much rather stick to the view that block untorn writes
> > > are a means for programs that only ever do IO in large(ish) blocks to
> > > take advantage of a hardware feature that also wants those large
> > > blocks.
> > 
> > I (vaguely) agree ith that.
> > 
> > > And only if the file mapping is in the correct state, and the
> > > program is willing to *maintain* them in the correct state to get the
> > > better performance.
> > 
> > I kinda agree with that, but the maintain is a bit hard as general
> > rule of thumb as file mappings can change behind the applications
> > back.  So building interfaces around the concept that there are
> > entirely stable mappings seems like a bad idea.
> 
> I tend to agree.

As long as it's a general rule that file mappings can change even after
whatever prep work an application tries to do, we're never going to have
an easy time enabling any of these fancy direct-to-storage tricks like
cpu loads and stores to pmem, or this block-untorn writes stuff.

> > 
> > > I don't want xfs to grow code to write zeroes to
> > > mapped blocks just so it can then write-untorn to the same blocks.
> > 
> > Agreed.
> > 
> 
> So if we want to allow large writes over mixed extents, how to handle?
> 
> Note that some time ago we also discussed that we don't want to have a
> single bio covering mixed extents as we cannot atomically convert all
> unwritten extents to mapped.

From https://lore.kernel.org/linux-xfs/Z3wbqlfoZjisbe1x@infradead.org/ :

"I think we should wire it up as a new FALLOC_FL_WRITE_ZEROES mode,
document very vigorously that it exists to facilitate pure overwrites
(specifically that it returns EOPNOTSUPP for always-cow files), and not
add more ioctls."

If we added this new fallocate mode to set up written mappings, would it
be enough to write in the programming manuals that applications should
use it to prepare a file for block-untorn writes?  Perhaps we should
change the errno code to EMEDIUMTYPE for the mixed mappings case.

Alternately, maybe we /should/ let programs open a lease-fd on a file
range, do their untorn writes through the lease fd, and if another
thread does something to break the lease, then the lease fd returns EIO
until you close it.

<shrug> any thoughts?

--D

> Thanks,
> John
> 
> 
> 
>
John Garry Jan. 8, 2025, 11:39 a.m. UTC | #10
On 08/01/2025 01:26, Darrick J. Wong wrote:
>>> I (vaguely) agree ith that.
>>>
>>>> And only if the file mapping is in the correct state, and the
>>>> program is willing to*maintain* them in the correct state to get the
>>>> better performance.
>>> I kinda agree with that, but the maintain is a bit hard as general
>>> rule of thumb as file mappings can change behind the applications
>>> back.  So building interfaces around the concept that there are
>>> entirely stable mappings seems like a bad idea.
>> I tend to agree.
> As long as it's a general rule that file mappings can change even after
> whatever prep work an application tries to do, we're never going to have
> an easy time enabling any of these fancy direct-to-storage tricks like
> cpu loads and stores to pmem, or this block-untorn writes stuff.
> 
>>>> I don't want xfs to grow code to write zeroes to
>>>> mapped blocks just so it can then write-untorn to the same blocks.
>>> Agreed.

Any other ideas on how to achieve this then?

There was the proposal to create a single bio covering mixed mappings, 
but then we had the issue that all the mappings cannot be atomically 
converted. I am not sure if this is really such an issue. I know that 
RWF_ATOMIC means all or nothing, but partially converted extents (from 
an atomic write) is a sort of grey area, as the original unmapped 
extents had nothing in the first place.

>>>
>> So if we want to allow large writes over mixed extents, how to handle?
>>
>> Note that some time ago we also discussed that we don't want to have a
>> single bio covering mixed extents as we cannot atomically convert all
>> unwritten extents to mapped.
> Fromhttps://lore.kernel.org/linux-xfs/Z3wbqlfoZjisbe1x@infradead.org/ :
> 
> "I think we should wire it up as a new FALLOC_FL_WRITE_ZEROES mode,
> document very vigorously that it exists to facilitate pure overwrites
> (specifically that it returns EOPNOTSUPP for always-cow files), and not
> add more ioctls."
> 
> If we added this new fallocate mode to set up written mappings, would it
> be enough to write in the programming manuals that applications should
> use it to prepare a file for block-untorn writes? 

Sure, that API extension could be useful in the case that we conclude 
that we don't permit atomic writes over mixed mappings.

> Perhaps we should
> change the errno code to EMEDIUMTYPE for the mixed mappings case.
> 
> Alternately, maybe we/should/ let programs open a lease-fd on a file
> range, do their untorn writes through the lease fd, and if another
> thread does something to break the lease, then the lease fd returns EIO
> until you close it.

So do means applications own specific ranges in files for exclusive 
atomic writes? Wouldn't that break what we already support today?

Cheers,
John
Darrick J. Wong Jan. 8, 2025, 5:42 p.m. UTC | #11
On Wed, Jan 08, 2025 at 11:39:35AM +0000, John Garry wrote:
> On 08/01/2025 01:26, Darrick J. Wong wrote:
> > > > I (vaguely) agree ith that.
> > > > 
> > > > > And only if the file mapping is in the correct state, and the
> > > > > program is willing to*maintain* them in the correct state to get the
> > > > > better performance.
> > > > I kinda agree with that, but the maintain is a bit hard as general
> > > > rule of thumb as file mappings can change behind the applications
> > > > back.  So building interfaces around the concept that there are
> > > > entirely stable mappings seems like a bad idea.
> > > I tend to agree.
> > As long as it's a general rule that file mappings can change even after
> > whatever prep work an application tries to do, we're never going to have
> > an easy time enabling any of these fancy direct-to-storage tricks like
> > cpu loads and stores to pmem, or this block-untorn writes stuff.
> > 
> > > > > I don't want xfs to grow code to write zeroes to
> > > > > mapped blocks just so it can then write-untorn to the same blocks.
> > > > Agreed.
> 
> Any other ideas on how to achieve this then?
> 
> There was the proposal to create a single bio covering mixed mappings, but
> then we had the issue that all the mappings cannot be atomically converted.
> I am not sure if this is really such an issue. I know that RWF_ATOMIC means
> all or nothing, but partially converted extents (from an atomic write) is a
> sort of grey area, as the original unmapped extents had nothing in the first
> place.

The long way -- introducing a file remap log intent item to guarantee
that the ioend processing completes no matter how mixed the mapping
might be.

> > > > 
> > > So if we want to allow large writes over mixed extents, how to handle?
> > > 
> > > Note that some time ago we also discussed that we don't want to have a
> > > single bio covering mixed extents as we cannot atomically convert all
> > > unwritten extents to mapped.
> > Fromhttps://lore.kernel.org/linux-xfs/Z3wbqlfoZjisbe1x@infradead.org/ :
> > 
> > "I think we should wire it up as a new FALLOC_FL_WRITE_ZEROES mode,
> > document very vigorously that it exists to facilitate pure overwrites
> > (specifically that it returns EOPNOTSUPP for always-cow files), and not
> > add more ioctls."
> > 
> > If we added this new fallocate mode to set up written mappings, would it
> > be enough to write in the programming manuals that applications should
> > use it to prepare a file for block-untorn writes?
> 
> Sure, that API extension could be useful in the case that we conclude that
> we don't permit atomic writes over mixed mappings.
> 
> > Perhaps we should
> > change the errno code to EMEDIUMTYPE for the mixed mappings case.
> > 
> > Alternately, maybe we/should/ let programs open a lease-fd on a file
> > range, do their untorn writes through the lease fd, and if another
> > thread does something to break the lease, then the lease fd returns EIO
> > until you close it.
> 
> So do means applications own specific ranges in files for exclusive atomic
> writes? Wouldn't that break what we already support today?

The application would own a lease on a specific range, but it could pass
that fd around.  Also you wouldn't need a lease for a single-fsblock
untorn write.

--D

> Cheers,
> John
> 
>
Christoph Hellwig Jan. 9, 2025, 7:54 a.m. UTC | #12
On Tue, Jan 07, 2025 at 05:26:36PM -0800, Darrick J. Wong wrote:
> "I think we should wire it up as a new FALLOC_FL_WRITE_ZEROES mode,
> document very vigorously that it exists to facilitate pure overwrites
> (specifically that it returns EOPNOTSUPP for always-cow files), and not
> add more ioctls."
> 
> If we added this new fallocate mode to set up written mappings, would it
> be enough to write in the programming manuals that applications should
> use it to prepare a file for block-untorn writes?  Perhaps we should
> change the errno code to EMEDIUMTYPE for the mixed mappings case.
> 
> Alternately, maybe we /should/ let programs open a lease-fd on a file
> range, do their untorn writes through the lease fd, and if another
> thread does something to break the lease, then the lease fd returns EIO
> until you close it.

This still violates the "no unexpected errors" paradigm.  The whole
FALLOC_FL_WRITE_ZEROES (I hate that name btw) model would only work
if we had a software fallback that make the operations slower but
still work in case of an unexpected change to the extent mapping.
diff mbox series

Patch

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 23fdad16e6a8..18c888f0c11f 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -805,6 +805,82 @@  iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(iomap_dio_rw);
 
+static loff_t
+iomap_dio_zero_unwritten_iter(struct iomap_iter *iter, struct iomap_dio *dio)
+{
+	const struct iomap *iomap = &iter->iomap;
+	loff_t length = iomap_length(iter);
+	loff_t pos = iter->pos;
+
+	if (iomap->type == IOMAP_UNWRITTEN) {
+		int ret;
+
+		dio->flags |= IOMAP_DIO_UNWRITTEN;
+		ret = iomap_dio_zero(iter, dio, pos, length);
+		if (ret)
+			return ret;
+	}
+
+	dio->size += length;
+
+	return length;
+}
+
+ssize_t
+iomap_dio_zero_unwritten(struct kiocb *iocb, struct iov_iter *iter,
+		const struct iomap_ops *ops, const struct iomap_dio_ops *dops)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	struct iomap_dio *dio;
+	ssize_t ret;
+	struct iomap_iter iomi = {
+		.inode		= inode,
+		.pos		= iocb->ki_pos,
+		.len		= iov_iter_count(iter),
+		.flags		= IOMAP_WRITE,
+	};
+
+	dio = kzalloc(sizeof(*dio), GFP_KERNEL);
+	if (!dio)
+		return -ENOMEM;
+
+	dio->iocb = iocb;
+	atomic_set(&dio->ref, 1);
+	dio->i_size = i_size_read(inode);
+	dio->dops = dops;
+	dio->submit.waiter = current;
+	dio->wait_for_completion = true;
+
+	inode_dio_begin(inode);
+
+	while ((ret = iomap_iter(&iomi, ops)) > 0)
+		iomi.processed = iomap_dio_zero_unwritten_iter(&iomi, dio);
+
+	if (ret < 0)
+		iomap_dio_set_error(dio, ret);
+
+	if (!atomic_dec_and_test(&dio->ref)) {
+		for (;;) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (!READ_ONCE(dio->submit.waiter))
+				break;
+
+			blk_io_schedule();
+		}
+		__set_current_state(TASK_RUNNING);
+	}
+
+	if (dops && dops->end_io)
+		ret = dops->end_io(iocb, dio->size, ret, dio->flags);
+
+	kfree(dio);
+
+	inode_dio_end(file_inode(iocb->ki_filp));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_dio_zero_unwritten);
+
 static int __init iomap_dio_init(void)
 {
 	zero_page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 5675af6b740c..c2d44b9e446d 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -440,6 +440,9 @@  ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
 		unsigned int dio_flags, void *private, size_t done_before);
+ssize_t iomap_dio_zero_unwritten(struct kiocb *iocb, struct iov_iter *iter,
+		const struct iomap_ops *ops, const struct iomap_dio_ops *dops);
+
 ssize_t iomap_dio_complete(struct iomap_dio *dio);
 void iomap_dio_bio_end_io(struct bio *bio);