diff mbox series

[v3,17/21] iomap: Atomic write support

Message ID 20240429174746.2132161-18-john.g.garry@oracle.com (mailing list archive)
State Superseded, archived
Headers show
Series block atomic writes for XFS | expand

Commit Message

John Garry April 29, 2024, 5:47 p.m. UTC
Support atomic writes by producing a single BIO with REQ_ATOMIC flag set.

We rely on the FS to guarantee extent alignment, such that an atomic write
should never straddle two or more extents. The FS should also check the
validity of an atomic write's length and alignment.

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/iomap/direct-io.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

Comments

Dave Chinner May 1, 2024, 1:47 a.m. UTC | #1
On Mon, Apr 29, 2024 at 05:47:42PM +0000, John Garry wrote:
> Support atomic writes by producing a single BIO with REQ_ATOMIC flag set.
> 
> We rely on the FS to guarantee extent alignment, such that an atomic write
> should never straddle two or more extents. The FS should also check for
> validity of an atomic write length/alignment.
> 
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/iomap/direct-io.c | 10 ++++++++++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index a3ed7cfa95bc..d7bdeb675068 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -275,6 +275,7 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
>  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  		struct iomap_dio *dio)
>  {
> +	bool is_atomic = dio->iocb->ki_flags & IOCB_ATOMIC;
>  	const struct iomap *iomap = &iter->iomap;
>  	struct inode *inode = iter->inode;
>  	unsigned int zeroing_size, pad;
> @@ -387,6 +388,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
>  		bio->bi_write_hint = inode->i_write_hint;
>  		bio->bi_ioprio = dio->iocb->ki_ioprio;
> +		if (is_atomic)
> +			bio->bi_opf |= REQ_ATOMIC;

REQ_ATOMIC is only valid for write IO, isn't it?

This should be added in iomap_dio_bio_opflags() after it is
determined we are doing a write operation.  Regardless, it should be
added in iomap_dio_bio_opflags(), not here. That also allows us to
get rid of the is_atomic variable.

> +
>  		bio->bi_private = dio;
>  		bio->bi_end_io = iomap_dio_bio_end_io;
>  
> @@ -403,6 +407,12 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  		}
>  
>  		n = bio->bi_iter.bi_size;
> +		if (is_atomic && n != orig_count) {
> +			/* This bio should have covered the complete length */
> +			ret = -EINVAL;
> +			bio_put(bio);
> +			goto out;
> +		}

What happens now if we've done zeroing IO before this? I suspect we
might expose stale data if the partial block zeroing converts the
unwritten extent in full...

>  		if (dio->flags & IOMAP_DIO_WRITE) {
>  			task_io_account_write(n);
>  		} else {

Ignoring the error handling issues, this code might be better as:

		if (dio->flags & IOMAP_DIO_WRITE) {
			if ((opflags & REQ_ATOMIC) && n != orig_count) {
				/* atomic writes are all or nothing */
				ret = -EIO;
				bio_put(bio);
				goto out;
			}
		}

so that we are not putting atomic write error checks in the read IO
submission path.

-Dave.
John Garry May 1, 2024, 11:08 a.m. UTC | #2
On 01/05/2024 02:47, Dave Chinner wrote:
> On Mon, Apr 29, 2024 at 05:47:42PM +0000, John Garry wrote:
>> Support atomic writes by producing a single BIO with REQ_ATOMIC flag set.
>>
>> We rely on the FS to guarantee extent alignment, such that an atomic write
>> should never straddle two or more extents. The FS should also check for
>> validity of an atomic write length/alignment.
>>
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>>   fs/iomap/direct-io.c | 10 ++++++++++
>>   1 file changed, 10 insertions(+)
>>
>> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
>> index a3ed7cfa95bc..d7bdeb675068 100644
>> --- a/fs/iomap/direct-io.c
>> +++ b/fs/iomap/direct-io.c
>> @@ -275,6 +275,7 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
>>   static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   		struct iomap_dio *dio)
>>   {
>> +	bool is_atomic = dio->iocb->ki_flags & IOCB_ATOMIC;
>>   	const struct iomap *iomap = &iter->iomap;
>>   	struct inode *inode = iter->inode;
>>   	unsigned int zeroing_size, pad;
>> @@ -387,6 +388,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
>>   		bio->bi_write_hint = inode->i_write_hint;
>>   		bio->bi_ioprio = dio->iocb->ki_ioprio;
>> +		if (is_atomic)
>> +			bio->bi_opf |= REQ_ATOMIC;
> 
> REQ_ATOMIC is only valid for write IO, isn't it?

yes, it is. We reject RWF_ATOMIC for a READ.

> 
> This should be added in iomap_dio_bio_opflags() after it is
> determined we are doing a write operation.  Regardless, it should be
> added in iomap_dio_bio_opflags(), not here. That also allows us to
> get rid of the is_atomic variable.

ok

> 
>> +
>>   		bio->bi_private = dio;
>>   		bio->bi_end_io = iomap_dio_bio_end_io;
>>   
>> @@ -403,6 +407,12 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   		}
>>   
>>   		n = bio->bi_iter.bi_size;
>> +		if (is_atomic && n != orig_count) {
>> +			/* This bio should have covered the complete length */
>> +			ret = -EINVAL;
>> +			bio_put(bio);
>> +			goto out;
>> +		}
> 
> What happens now if we've done zeroing IO before this? I suspect we
> might expose stale data if the partial block zeroing converts the
> unwritten extent in full...

We use iomap_dio.ref to ensure that __iomap_dio_rw() does not return 
until any zeroing and actual sub-io block write completes. See 
iomap_dio_zero() -> iomap_dio_submit_bio() -> atomic_inc(&dio->ref) 
callchain. I meant to add such info to the commit message, as you 
questioned this previously.

> 
>>   		if (dio->flags & IOMAP_DIO_WRITE) {
>>   			task_io_account_write(n);
>>   		} else {
> 
> Ignoring the error handling issues, this code might be better as:
> 
> 		if (dio->flags & IOMAP_DIO_WRITE) {
> 			if ((opflags & REQ_ATOMIC) && n != orig_count) {
> 				/* atomic writes are all or nothing */
> 				ret = -EIO;
> 				bio_put(bio);
> 				goto out;
> 			}
> 		}
> 
> so that we are not putting atomic write error checks in the read IO
> submission path.
> 

Maybe, I'll look at a rework with the suggested change to use 
iomap_dio_bio_opflags() - I actually thought that I introduced a change 
to use iomap_dio_bio_opflags() previously...

BTW, we need to return -EINVAL, as this is what userspace expects for 
such an error.

Thanks,
John
Dave Chinner May 2, 2024, 1:43 a.m. UTC | #3
On Wed, May 01, 2024 at 12:08:34PM +0100, John Garry wrote:
> On 01/05/2024 02:47, Dave Chinner wrote:
> > On Mon, Apr 29, 2024 at 05:47:42PM +0000, John Garry wrote:
> > > Support atomic writes by producing a single BIO with REQ_ATOMIC flag set.
> > > 
> > > We rely on the FS to guarantee extent alignment, such that an atomic write
> > > should never straddle two or more extents. The FS should also check for
> > > validity of an atomic write length/alignment.
> > > 
> > > Signed-off-by: John Garry <john.g.garry@oracle.com>
> > > ---
...
> > > +
> > >   		bio->bi_private = dio;
> > >   		bio->bi_end_io = iomap_dio_bio_end_io;
> > > @@ -403,6 +407,12 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> > >   		}
> > >   		n = bio->bi_iter.bi_size;
> > > +		if (is_atomic && n != orig_count) {
> > > +			/* This bio should have covered the complete length */
> > > +			ret = -EINVAL;
> > > +			bio_put(bio);
> > > +			goto out;
> > > +		}
> > 
> > What happens now if we've done zeroing IO before this? I suspect we
> > might expose stale data if the partial block zeroing converts the
> > unwritten extent in full...
> 
> We use iomap_dio.ref to ensure that __iomap_dio_rw() does not return until
> any zeroing and actual sub-io block write completes. See iomap_dio_zero() ->
> iomap_dio_submit_bio() -> atomic_inc(&dio->ref) callchain. I meant to add
> such info to the commit message, as you questioned this previously.

Yes, I get that. But my point is that we may have only done -part-
of a block unaligned IO.

This is effectively a failure from a bio_iov_iter_get_pages() call.
What does the bio_iov_iter_get_pages() failure case do that this new
failure case does not do? Why does this case have different failure
handling?

-Dave.
John Garry May 2, 2024, 9:12 a.m. UTC | #4
On 02/05/2024 02:43, Dave Chinner wrote:
> On Wed, May 01, 2024 at 12:08:34PM +0100, John Garry wrote:
>> On 01/05/2024 02:47, Dave Chinner wrote:
>>> On Mon, Apr 29, 2024 at 05:47:42PM +0000, John Garry wrote:
>>>> Support atomic writes by producing a single BIO with REQ_ATOMIC flag set.
>>>>
>>>> We rely on the FS to guarantee extent alignment, such that an atomic write
>>>> should never straddle two or more extents. The FS should also check for
>>>> validity of an atomic write length/alignment.
>>>>
>>>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>>>> ---
> ...
>>>> +
>>>>    		bio->bi_private = dio;
>>>>    		bio->bi_end_io = iomap_dio_bio_end_io;
>>>> @@ -403,6 +407,12 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>>>    		}
>>>>    		n = bio->bi_iter.bi_size;
>>>> +		if (is_atomic && n != orig_count) {
>>>> +			/* This bio should have covered the complete length */
>>>> +			ret = -EINVAL;
>>>> +			bio_put(bio);
>>>> +			goto out;
>>>> +		}
>>>
>>> What happens now if we've done zeroing IO before this? I suspect we
>>> might expose stale data if the partial block zeroing converts the
>>> unwritten extent in full...
>>
>> We use iomap_dio.ref to ensure that __iomap_dio_rw() does not return until
>> any zeroing and actual sub-io block write completes. See iomap_dio_zero() ->
>> iomap_dio_submit_bio() -> atomic_inc(&dio->ref) callchain. I meant to add
>> such info to the commit message, as you questioned this previously.
> 
> Yes, I get that. But my point is that we may have only done -part-
> of a block unaligned IO.
> 
> This is effectively a failure from a bio_iov_iter_get_pages() call.
> What does the bio_iov_iter_get_pages() failure case do that this new
> failure case does not do? Why does this case have different failure
> handling?
> 

So you are saying that if we fail here (that is the (is_atomic && n != 
orig_count) check), anything unwritten in the atomic write region and 
zerotail region could expose stale data, right?

If so, I would say that we need to zero the complete unwritten atomic 
write and zerotail regions - similar to the bio_iov_iter_get_pages() 
failure handling - and still report an -EINVAL error.

How does that sound?

Thanks,
John
diff mbox series

Patch

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index a3ed7cfa95bc..d7bdeb675068 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -275,6 +275,7 @@  static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
 static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 		struct iomap_dio *dio)
 {
+	bool is_atomic = dio->iocb->ki_flags & IOCB_ATOMIC;
 	const struct iomap *iomap = &iter->iomap;
 	struct inode *inode = iter->inode;
 	unsigned int zeroing_size, pad;
@@ -387,6 +388,9 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
 		bio->bi_write_hint = inode->i_write_hint;
 		bio->bi_ioprio = dio->iocb->ki_ioprio;
+		if (is_atomic)
+			bio->bi_opf |= REQ_ATOMIC;
+
 		bio->bi_private = dio;
 		bio->bi_end_io = iomap_dio_bio_end_io;
 
@@ -403,6 +407,12 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 		}
 
 		n = bio->bi_iter.bi_size;
+		if (is_atomic && n != orig_count) {
+			/* This bio should have covered the complete length */
+			ret = -EINVAL;
+			bio_put(bio);
+			goto out;
+		}
 		if (dio->flags & IOMAP_DIO_WRITE) {
 			task_io_account_write(n);
 		} else {