diff mbox series

[1/6] fs: iomap: Atomic write support

Message ID 20240124142645.9334-2-john.g.garry@oracle.com (mailing list archive)
State Superseded, archived
Headers show
Series block atomic writes for XFS | expand

Commit Message

John Garry Jan. 24, 2024, 2:26 p.m. UTC
Add flag IOMAP_ATOMIC_WRITE to indicate to the FS that an atomic write
bio is being created and all the rules there need to be followed.

It is the task of the FS iomap iter callbacks to ensure that the mapping
created adheres to those rules, like size is power-of-2, is at a
naturally-aligned offset, etc. However, checking for a single iovec, i.e.
iter type is ubuf, is done in __iomap_dio_rw().

A write should only produce a single bio, so error when it doesn't.

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/iomap/direct-io.c  | 21 ++++++++++++++++++++-
 fs/iomap/trace.h      |  3 ++-
 include/linux/iomap.h |  1 +
 3 files changed, 23 insertions(+), 2 deletions(-)

Comments

Darrick J. Wong Feb. 2, 2024, 5:25 p.m. UTC | #1
On Wed, Jan 24, 2024 at 02:26:40PM +0000, John Garry wrote:
> Add flag IOMAP_ATOMIC_WRITE to indicate to the FS that an atomic write
> bio is being created and all the rules there need to be followed.
> 
> It is the task of the FS iomap iter callbacks to ensure that the mapping
> created adheres to those rules, like size is power-of-2, is at a
> naturally-aligned offset, etc. However, checking for a single iovec, i.e.
> iter type is ubuf, is done in __iomap_dio_rw().
> 
> A write should only produce a single bio, so error when it doesn't.
> 
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/iomap/direct-io.c  | 21 ++++++++++++++++++++-
>  fs/iomap/trace.h      |  3 ++-
>  include/linux/iomap.h |  1 +
>  3 files changed, 23 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index bcd3f8cf5ea4..25736d01b857 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -275,10 +275,12 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
>  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  		struct iomap_dio *dio)
>  {
> +	bool atomic_write = iter->flags & IOMAP_ATOMIC;
>  	const struct iomap *iomap = &iter->iomap;
>  	struct inode *inode = iter->inode;
>  	unsigned int fs_block_size = i_blocksize(inode), pad;
>  	loff_t length = iomap_length(iter);
> +	const size_t iter_len = iter->len;
>  	loff_t pos = iter->pos;
>  	blk_opf_t bio_opf;
>  	struct bio *bio;
> @@ -381,6 +383,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  					  GFP_KERNEL);
>  		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
>  		bio->bi_ioprio = dio->iocb->ki_ioprio;
> +		if (atomic_write)
> +			bio->bi_opf |= REQ_ATOMIC;

This really ought to be in iomap_dio_bio_opflags.  Unless you can't pass
REQ_ATOMIC to bio_alloc*, in which case there ought to be a comment
about why.

Also, what's the meaning of REQ_OP_READ | REQ_ATOMIC?  Does that
actually work?  I don't know what that means, and "block: Add REQ_ATOMIC
flag" says that's not a valid combination.  I'll complain about this
more below.

> +
>  		bio->bi_private = dio;
>  		bio->bi_end_io = iomap_dio_bio_end_io;
>  
> @@ -397,6 +402,12 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  		}
>  
>  		n = bio->bi_iter.bi_size;
> +		if (atomic_write && n != iter_len) {

s/iter_len/orig_len/ ?

> +			/* This bio should have covered the complete length */
> +			ret = -EINVAL;
> +			bio_put(bio);
> +			goto out;
> +		}
>  		if (dio->flags & IOMAP_DIO_WRITE) {
>  			task_io_account_write(n);
>  		} else {
> @@ -554,12 +565,17 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  	struct blk_plug plug;
>  	struct iomap_dio *dio;
>  	loff_t ret = 0;
> +	bool is_read = iov_iter_rw(iter) == READ;
> +	bool atomic_write = (iocb->ki_flags & IOCB_ATOMIC) && !is_read;

Hrmm.  So if the caller passes in an IOCB_ATOMIC iocb with a READ iter,
we'll silently drop IOCB_ATOMIC and do the read anyway?  That seems like
a nonsense combination, but is that ok for some reason?

>  	trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);
>  
>  	if (!iomi.len)
>  		return NULL;
>  
> +	if (atomic_write && !iter_is_ubuf(iter))
> +		return ERR_PTR(-EINVAL);

Does !iter_is_ubuf actually happen?  Why don't we support any of the
other ITER_ types?  Is it because hardware doesn't want vectored
buffers?

I really wish there was more commenting on /why/ we do things here:

	if (iocb->ki_flags & IOCB_ATOMIC) {
		/* atomic reads do not make sense */
		if (iov_iter_rw(iter) == READ)
			return ERR_PTR(-EINVAL);

		/*
		 * block layer doesn't want to handle handle vectors of
		 * buffers when performing an atomic write i guess?
		 */
		if (!iter_is_ubuf(iter))
			return ERR_PTR(-EINVAL);

		iomi.flags |= IOMAP_ATOMIC;
	}

> +
>  	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
>  	if (!dio)
>  		return ERR_PTR(-ENOMEM);
> @@ -579,7 +595,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  	if (iocb->ki_flags & IOCB_NOWAIT)
>  		iomi.flags |= IOMAP_NOWAIT;
>  
> -	if (iov_iter_rw(iter) == READ) {
> +	if (is_read) {
>  		/* reads can always complete inline */
>  		dio->flags |= IOMAP_DIO_INLINE_COMP;
>  
> @@ -605,6 +621,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
>  			dio->flags |= IOMAP_DIO_CALLER_COMP;
>  
> +		if (atomic_write)
> +			iomi.flags |= IOMAP_ATOMIC;
> +
>  		if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
>  			ret = -EAGAIN;
>  			if (iomi.pos >= dio->i_size ||
> diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
> index c16fd55f5595..c95576420bca 100644
> --- a/fs/iomap/trace.h
> +++ b/fs/iomap/trace.h
> @@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
>  	{ IOMAP_REPORT,		"REPORT" }, \
>  	{ IOMAP_FAULT,		"FAULT" }, \
>  	{ IOMAP_DIRECT,		"DIRECT" }, \
> -	{ IOMAP_NOWAIT,		"NOWAIT" }
> +	{ IOMAP_NOWAIT,		"NOWAIT" }, \
> +	{ IOMAP_ATOMIC,		"ATOMIC" }
>  
>  #define IOMAP_F_FLAGS_STRINGS \
>  	{ IOMAP_F_NEW,		"NEW" }, \
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 96dd0acbba44..9eac704a0d6f 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -178,6 +178,7 @@ struct iomap_folio_ops {
>  #else
>  #define IOMAP_DAX		0
>  #endif /* CONFIG_FS_DAX */
> +#define IOMAP_ATOMIC		(1 << 9)
>  
>  struct iomap_ops {
>  	/*
> -- 
> 2.31.1
> 
>
John Garry Feb. 5, 2024, 11:29 a.m. UTC | #2
On 02/02/2024 17:25, Darrick J. Wong wrote:
> On Wed, Jan 24, 2024 at 02:26:40PM +0000, John Garry wrote:
>> Add flag IOMAP_ATOMIC_WRITE to indicate to the FS that an atomic write
>> bio is being created and all the rules there need to be followed.
>>
>> It is the task of the FS iomap iter callbacks to ensure that the mapping
>> created adheres to those rules, like size is power-of-2, is at a
>> naturally-aligned offset, etc. However, checking for a single iovec, i.e.
>> iter type is ubuf, is done in __iomap_dio_rw().
>>
>> A write should only produce a single bio, so error when it doesn't.
>>
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>>   fs/iomap/direct-io.c  | 21 ++++++++++++++++++++-
>>   fs/iomap/trace.h      |  3 ++-
>>   include/linux/iomap.h |  1 +
>>   3 files changed, 23 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
>> index bcd3f8cf5ea4..25736d01b857 100644
>> --- a/fs/iomap/direct-io.c
>> +++ b/fs/iomap/direct-io.c
>> @@ -275,10 +275,12 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
>>   static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   		struct iomap_dio *dio)
>>   {
>> +	bool atomic_write = iter->flags & IOMAP_ATOMIC;
>>   	const struct iomap *iomap = &iter->iomap;
>>   	struct inode *inode = iter->inode;
>>   	unsigned int fs_block_size = i_blocksize(inode), pad;
>>   	loff_t length = iomap_length(iter);
>> +	const size_t iter_len = iter->len;
>>   	loff_t pos = iter->pos;
>>   	blk_opf_t bio_opf;
>>   	struct bio *bio;
>> @@ -381,6 +383,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   					  GFP_KERNEL);
>>   		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
>>   		bio->bi_ioprio = dio->iocb->ki_ioprio;
>> +		if (atomic_write)
>> +			bio->bi_opf |= REQ_ATOMIC;
> 
> This really ought to be in iomap_dio_bio_opflags.  Unless you can't pass
> REQ_ATOMIC to bio_alloc*, in which case there ought to be a comment
> about why.

I think that should be ok

> 
> Also, what's the meaning of REQ_OP_READ | REQ_ATOMIC? 

REQ_ATOMIC will be ignored for REQ_OP_READ. I'm following the same 
policy as something like RWF_SYNC for a read.

However, if FMODE_CAN_ATOMIC_WRITE is unset, then REQ_ATOMIC will be 
rejected for both REQ_OP_READ and REQ_OP_WRITE.

> Does that
> actually work?  I don't know what that means, and "block: Add REQ_ATOMIC
> flag" says that's not a valid combination.  I'll complain about this
> more below.

Please note that I do mention that this flag is only meaningful for 
pwritev2(), like RWF_SYNC, here:
https://lore.kernel.org/linux-api/20240124112731.28579-3-john.g.garry@oracle.com/

> 
>> +
>>   		bio->bi_private = dio;
>>   		bio->bi_end_io = iomap_dio_bio_end_io;
>>   
>> @@ -397,6 +402,12 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   		}
>>   
>>   		n = bio->bi_iter.bi_size;
>> +		if (atomic_write && n != iter_len) {
> 
> s/iter_len/orig_len/ ?

ok, I can change the name if you prefer

> 
>> +			/* This bio should have covered the complete length */
>> +			ret = -EINVAL;
>> +			bio_put(bio);
>> +			goto out;
>> +		}
>>   		if (dio->flags & IOMAP_DIO_WRITE) {
>>   			task_io_account_write(n);
>>   		} else {
>> @@ -554,12 +565,17 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>>   	struct blk_plug plug;
>>   	struct iomap_dio *dio;
>>   	loff_t ret = 0;
>> +	bool is_read = iov_iter_rw(iter) == READ;
>> +	bool atomic_write = (iocb->ki_flags & IOCB_ATOMIC) && !is_read;
> 
> Hrmm.  So if the caller passes in an IOCB_ATOMIC iocb with a READ iter,
> we'll silently drop IOCB_ATOMIC and do the read anyway?  That seems like
> a nonsense combination, but is that ok for some reason?

Please see above

> 
>>   	trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);
>>   
>>   	if (!iomi.len)
>>   		return NULL;
>>   
>> +	if (atomic_write && !iter_is_ubuf(iter))
>> +		return ERR_PTR(-EINVAL);
> 
> Does !iter_is_ubuf actually happen? 

Sure, if someone uses iovcnt > 1 for pwritev2

Please see __import_iovec(), where only if iovcnt == 1 we create 
iter_type == ITER_UBUF, if > 1 then we have iter_type == ITER_IOVEC

> Why don't we support any of the
> other ITER_ types?  Is it because hardware doesn't want vectored
> buffers?
It's related how we can determine atomic_write_unit_max for the bdev.

We want to give a definitive max write value which we can guarantee to 
always fit in a BIO, but not mandate any extra special iovec 
length/alignment rules.

Without any iovec length or alignment rules (apart from direct IO rules 
that an iovec needs to be bdev logical block size and length aligned) , 
if a user provides many iovecs, then we may only be able to only fit 
bdev LBS of data (typically 512B) in each BIO vector, and thus we need 
to give a pessimistically low atomic_write_unit_max value.

If we say that iovcnt max == 1, then we know that we can fit PAGE size 
of data in each BIO vector (ignoring first/last vectors), and this will 
give a reasonably large atomic_write_unit_max value.

Note that we do now provide this iovcnt max value via statx, but always 
return 1 for now. This was agreed with Christoph, please see:
https://lore.kernel.org/linux-nvme/20240117150200.GA30112@lst.de/

> 
> I really wish there was more commenting on /why/ we do things here:
> 
> 	if (iocb->ki_flags & IOCB_ATOMIC) {
> 		/* atomic reads do not make sense */
> 		if (iov_iter_rw(iter) == READ)
> 			return ERR_PTR(-EINVAL);
> 
> 		/*
> 		 * block layer doesn't want to handle handle vectors of
> 		 * buffers when performing an atomic write i guess?
> 		 */
> 		if (!iter_is_ubuf(iter))
> 			return ERR_PTR(-EINVAL);
> 
> 		iomi.flags |= IOMAP_ATOMIC;
> 	}

ok, I can make this more clear.

Note: It would be nice if we could check this in 
xfs_iomap_write_direct() or a common VFS helper (which 
xfs_iomap_write_direct() calls), but iter is not available there.

I could just check iter_is_ubuf() on its own in the vfs rw path, but I 
would like to keep the checks as close together as possible.

Thanks,
John
Pankaj Raghav (Samsung) Feb. 5, 2024, 3:20 p.m. UTC | #3
On Wed, Jan 24, 2024 at 02:26:40PM +0000, John Garry wrote:
> Add flag IOMAP_ATOMIC_WRITE to indicate to the FS that an atomic write
> bio is being created and all the rules there need to be followed.
> 
> It is the task of the FS iomap iter callbacks to ensure that the mapping
> created adheres to those rules, like size is power-of-2, is at a
> naturally-aligned offset, etc. However, checking for a single iovec, i.e.
> iter type is ubuf, is done in __iomap_dio_rw().
> 
> A write should only produce a single bio, so error when it doesn't.
> 
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  fs/iomap/direct-io.c  | 21 ++++++++++++++++++++-
>  fs/iomap/trace.h      |  3 ++-
>  include/linux/iomap.h |  1 +
>  3 files changed, 23 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index bcd3f8cf5ea4..25736d01b857 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -275,10 +275,12 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
>  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>  		struct iomap_dio *dio)
>  {
> +	bool atomic_write = iter->flags & IOMAP_ATOMIC;

Minor nit: the commit says IOMAP_ATOMIC_WRITE and you set the enum as
IOMAP_ATOMIC in the code.

As the atomic semantics only apply to write, the commit could be just
reworded to reflect the code?

<snip>
> diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
> index c16fd55f5595..c95576420bca 100644
> --- a/fs/iomap/trace.h
> +++ b/fs/iomap/trace.h
> @@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
>  	{ IOMAP_REPORT,		"REPORT" }, \
>  	{ IOMAP_FAULT,		"FAULT" }, \
>  	{ IOMAP_DIRECT,		"DIRECT" }, \
> -	{ IOMAP_NOWAIT,		"NOWAIT" }
> +	{ IOMAP_NOWAIT,		"NOWAIT" }, \
> +	{ IOMAP_ATOMIC,		"ATOMIC" }
>
John Garry Feb. 5, 2024, 3:41 p.m. UTC | #4
On 05/02/2024 15:20, Pankaj Raghav (Samsung) wrote:
> On Wed, Jan 24, 2024 at 02:26:40PM +0000, John Garry wrote:
>> Add flag IOMAP_ATOMIC_WRITE to indicate to the FS that an atomic write
>> bio is being created and all the rules there need to be followed.
>>
>> It is the task of the FS iomap iter callbacks to ensure that the mapping
>> created adheres to those rules, like size is power-of-2, is at a
>> naturally-aligned offset, etc. However, checking for a single iovec, i.e.
>> iter type is ubuf, is done in __iomap_dio_rw().
>>
>> A write should only produce a single bio, so error when it doesn't.
>>
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>>   fs/iomap/direct-io.c  | 21 ++++++++++++++++++++-
>>   fs/iomap/trace.h      |  3 ++-
>>   include/linux/iomap.h |  1 +
>>   3 files changed, 23 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
>> index bcd3f8cf5ea4..25736d01b857 100644
>> --- a/fs/iomap/direct-io.c
>> +++ b/fs/iomap/direct-io.c
>> @@ -275,10 +275,12 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
>>   static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
>>   		struct iomap_dio *dio)
>>   {
>> +	bool atomic_write = iter->flags & IOMAP_ATOMIC;
> 
> Minor nit: the commit says IOMAP_ATOMIC_WRITE and you set the enum as
> IOMAP_ATOMIC in the code.

Thanks for spotting this

> 
> As the atomic semantics only apply to write, the commit could be just
> reworded to reflect the code?

Yes, so I was advised to change IOMAP_ATOMIC_WRITE  -> IOMAP_ATOMIC, as 
this flag is just a write modifier. I just didn't update the commit message.

Thanks,
John

> 
> <snip>
>> diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
>> index c16fd55f5595..c95576420bca 100644
>> --- a/fs/iomap/trace.h
>> +++ b/fs/iomap/trace.h
>> @@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
>>   	{ IOMAP_REPORT,		"REPORT" }, \
>>   	{ IOMAP_FAULT,		"FAULT" }, \
>>   	{ IOMAP_DIRECT,		"DIRECT" }, \
>> -	{ IOMAP_NOWAIT,		"NOWAIT" }
>> +	{ IOMAP_NOWAIT,		"NOWAIT" }, \
>> +	{ IOMAP_ATOMIC,		"ATOMIC" }
>>
Christoph Hellwig Feb. 13, 2024, 6:55 a.m. UTC | #5
On Mon, Feb 05, 2024 at 11:29:57AM +0000, John Garry wrote:
>>
>> Also, what's the meaning of REQ_OP_READ | REQ_ATOMIC? 
>
> REQ_ATOMIC will be ignored for REQ_OP_READ. I'm following the same policy 
> as something like RWF_SYNC for a read.

We've been rather sloppy with these flags in the past, which isn't
a good thing.  Let's add proper checking for new interfaces.
John Garry Feb. 13, 2024, 8:20 a.m. UTC | #6
On 13/02/2024 06:55, Christoph Hellwig wrote:
> On Mon, Feb 05, 2024 at 11:29:57AM +0000, John Garry wrote:
>>> Also, what's the meaning of REQ_OP_READ | REQ_ATOMIC?
>> REQ_ATOMIC will be ignored for REQ_OP_READ. I'm following the same policy
>> as something like RWF_SYNC for a read.
> We've been rather sloppy with these flags in the past, which isn't
> a good thing.  Let's add proper checking for new interfaces.

ok, fine.

Thanks,
John
Darrick J. Wong Feb. 13, 2024, 6:08 p.m. UTC | #7
On Mon, Feb 05, 2024 at 11:29:57AM +0000, John Garry wrote:
> On 02/02/2024 17:25, Darrick J. Wong wrote:
> > On Wed, Jan 24, 2024 at 02:26:40PM +0000, John Garry wrote:
> > > Add flag IOMAP_ATOMIC_WRITE to indicate to the FS that an atomic write
> > > bio is being created and all the rules there need to be followed.
> > > 
> > > It is the task of the FS iomap iter callbacks to ensure that the mapping
> > > created adheres to those rules, like size is power-of-2, is at a
> > > naturally-aligned offset, etc. However, checking for a single iovec, i.e.
> > > iter type is ubuf, is done in __iomap_dio_rw().
> > > 
> > > A write should only produce a single bio, so error when it doesn't.
> > > 
> > > Signed-off-by: John Garry <john.g.garry@oracle.com>
> > > ---
> > >   fs/iomap/direct-io.c  | 21 ++++++++++++++++++++-
> > >   fs/iomap/trace.h      |  3 ++-
> > >   include/linux/iomap.h |  1 +
> > >   3 files changed, 23 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> > > index bcd3f8cf5ea4..25736d01b857 100644
> > > --- a/fs/iomap/direct-io.c
> > > +++ b/fs/iomap/direct-io.c
> > > @@ -275,10 +275,12 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
> > >   static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> > >   		struct iomap_dio *dio)
> > >   {
> > > +	bool atomic_write = iter->flags & IOMAP_ATOMIC;
> > >   	const struct iomap *iomap = &iter->iomap;
> > >   	struct inode *inode = iter->inode;
> > >   	unsigned int fs_block_size = i_blocksize(inode), pad;
> > >   	loff_t length = iomap_length(iter);
> > > +	const size_t iter_len = iter->len;
> > >   	loff_t pos = iter->pos;
> > >   	blk_opf_t bio_opf;
> > >   	struct bio *bio;
> > > @@ -381,6 +383,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> > >   					  GFP_KERNEL);
> > >   		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
> > >   		bio->bi_ioprio = dio->iocb->ki_ioprio;
> > > +		if (atomic_write)
> > > +			bio->bi_opf |= REQ_ATOMIC;
> > 
> > This really ought to be in iomap_dio_bio_opflags.  Unless you can't pass
> > REQ_ATOMIC to bio_alloc*, in which case there ought to be a comment
> > about why.
> 
> I think that should be ok
> 
> > 
> > Also, what's the meaning of REQ_OP_READ | REQ_ATOMIC?
> 
> REQ_ATOMIC will be ignored for REQ_OP_READ. I'm following the same policy as
> something like RWF_SYNC for a read.
> 
> However, if FMODE_CAN_ATOMIC_WRITE is unset, then REQ_ATOMIC will be
> rejected for both REQ_OP_READ and REQ_OP_WRITE.
> 
> > Does that
> > actually work?  I don't know what that means, and "block: Add REQ_ATOMIC
> > flag" says that's not a valid combination.  I'll complain about this
> > more below.
> 
> Please note that I do mention that this flag is only meaningful for
> pwritev2(), like RWF_SYNC, here:
> https://lore.kernel.org/linux-api/20240124112731.28579-3-john.g.garry@oracle.com/
> 
> > 
> > > +
> > >   		bio->bi_private = dio;
> > >   		bio->bi_end_io = iomap_dio_bio_end_io;
> > > @@ -397,6 +402,12 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> > >   		}
> > >   		n = bio->bi_iter.bi_size;
> > > +		if (atomic_write && n != iter_len) {
> > 
> > s/iter_len/orig_len/ ?
> 
> ok, I can change the name if you prefer
> 
> > 
> > > +			/* This bio should have covered the complete length */
> > > +			ret = -EINVAL;
> > > +			bio_put(bio);
> > > +			goto out;
> > > +		}
> > >   		if (dio->flags & IOMAP_DIO_WRITE) {
> > >   			task_io_account_write(n);
> > >   		} else {
> > > @@ -554,12 +565,17 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> > >   	struct blk_plug plug;
> > >   	struct iomap_dio *dio;
> > >   	loff_t ret = 0;
> > > +	bool is_read = iov_iter_rw(iter) == READ;
> > > +	bool atomic_write = (iocb->ki_flags & IOCB_ATOMIC) && !is_read;
> > 
> > Hrmm.  So if the caller passes in an IOCB_ATOMIC iocb with a READ iter,
> > we'll silently drop IOCB_ATOMIC and do the read anyway?  That seems like
> > a nonsense combination, but is that ok for some reason?
> 
> Please see above
> 
> > 
> > >   	trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);
> > >   	if (!iomi.len)
> > >   		return NULL;
> > > +	if (atomic_write && !iter_is_ubuf(iter))
> > > +		return ERR_PTR(-EINVAL);
> > 
> > Does !iter_is_ubuf actually happen?
> 
> Sure, if someone uses iovcnt > 1 for pwritev2
> 
> Please see __import_iovec(), where only if iovcnt == 1 we create iter_type
> == ITER_UBUF, if > 1 then we have iter_type == ITER_IOVEC

Ok.  The iter stuff (especially the macros) confuse the hell out of me
every time I go reading through that.

> > Why don't we support any of the
> > other ITER_ types?  Is it because hardware doesn't want vectored
> > buffers?
> It's related how we can determine atomic_write_unit_max for the bdev.
> 
> We want to give a definitive max write value which we can guarantee to
> always fit in a BIO, but not mandate any extra special iovec
> length/alignment rules.
> 
> Without any iovec length or alignment rules (apart from direct IO rules that
> an iovec needs to be bdev logical block size and length aligned) , if a user
> provides many iovecs, then we may only be able to only fit bdev LBS of data
> (typically 512B) in each BIO vector, and thus we need to give a
> pessimistically low atomic_write_unit_max value.
> 
> If we say that iovcnt max == 1, then we know that we can fit PAGE size of
> data in each BIO vector (ignoring first/last vectors), and this will give a
> reasonably large atomic_write_unit_max value.
> 
> Note that we do now provide this iovcnt max value via statx, but always
> return 1 for now. This was agreed with Christoph, please see:
> https://lore.kernel.org/linux-nvme/20240117150200.GA30112@lst.de/

Got it.  We can always add ITER_IOVEC support later if we figure out a
sane way to restrain userspace. :)

> > 
> > I really wish there was more commenting on /why/ we do things here:
> > 
> > 	if (iocb->ki_flags & IOCB_ATOMIC) {
> > 		/* atomic reads do not make sense */
> > 		if (iov_iter_rw(iter) == READ)
> > 			return ERR_PTR(-EINVAL);
> > 
> > 		/*
> > 		 * block layer doesn't want to handle handle vectors of
> > 		 * buffers when performing an atomic write i guess?
> > 		 */
> > 		if (!iter_is_ubuf(iter))
> > 			return ERR_PTR(-EINVAL);
> > 
> > 		iomi.flags |= IOMAP_ATOMIC;
> > 	}
> 
> ok, I can make this more clear.
> 
> Note: It would be nice if we could check this in xfs_iomap_write_direct() or
> a common VFS helper (which xfs_iomap_write_direct() calls), but iter is not
> available there.

No, do not put generic stuff like that in XFS, leave it here in iomap.

> I could just check iter_is_ubuf() on its own in the vfs rw path, but I would
> like to keep the checks as close together as possible.

Yeah, and I want you to put as many of the checks in the VFS as
possible so that we (or really Ojaswin) don't end up copy-pasting all
that validation into ext4 and every other filesystem that wants to
expose untorn writes.

AFAICT the only things XFS really needs to do on its own is check that
the xfs_inode_alloc_unit() is a power of two if untorn writes are
present; and adjusting the awu min/max for statx reporting.

--D

> Thanks,
> John
>
John Garry Feb. 15, 2024, 11:08 a.m. UTC | #8
On 13/02/2024 08:20, John Garry wrote:
> On 13/02/2024 06:55, Christoph Hellwig wrote:
>> On Mon, Feb 05, 2024 at 11:29:57AM +0000, John Garry wrote:
>>>> Also, what's the meaning of REQ_OP_READ | REQ_ATOMIC?
>>> REQ_ATOMIC will be ignored for REQ_OP_READ. I'm following the same 
>>> policy
>>> as something like RWF_SYNC for a read.
>> We've been rather sloppy with these flags in the past, which isn't
>> a good thing.  Let's add proper checking for new interfaces.

How about something like this:

----8<----

-static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
+static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, int 
type)
  {
   int kiocb_flags = 0;

...

+ if (flags & RWF_ATOMIC) {
+ 	if (type == READ)
+ 		return -EOPNOTSUPP;
+ 	if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE))
+ 		return -EOPNOTSUPP;
+ }
   kiocb_flags |= (__force int) (flags & RWF_SUPPORTED);
   if (flags & RWF_SYNC)
  	 kiocb_flags |= IOCB_DSYNC;

---->8----

I don't see a better place to add this check.

John
diff mbox series

Patch

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index bcd3f8cf5ea4..25736d01b857 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -275,10 +275,12 @@  static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
 static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 		struct iomap_dio *dio)
 {
+	bool atomic_write = iter->flags & IOMAP_ATOMIC;
 	const struct iomap *iomap = &iter->iomap;
 	struct inode *inode = iter->inode;
 	unsigned int fs_block_size = i_blocksize(inode), pad;
 	loff_t length = iomap_length(iter);
+	const size_t iter_len = iter->len;
 	loff_t pos = iter->pos;
 	blk_opf_t bio_opf;
 	struct bio *bio;
@@ -381,6 +383,9 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 					  GFP_KERNEL);
 		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
 		bio->bi_ioprio = dio->iocb->ki_ioprio;
+		if (atomic_write)
+			bio->bi_opf |= REQ_ATOMIC;
+
 		bio->bi_private = dio;
 		bio->bi_end_io = iomap_dio_bio_end_io;
 
@@ -397,6 +402,12 @@  static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 		}
 
 		n = bio->bi_iter.bi_size;
+		if (atomic_write && n != iter_len) {
+			/* This bio should have covered the complete length */
+			ret = -EINVAL;
+			bio_put(bio);
+			goto out;
+		}
 		if (dio->flags & IOMAP_DIO_WRITE) {
 			task_io_account_write(n);
 		} else {
@@ -554,12 +565,17 @@  __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	struct blk_plug plug;
 	struct iomap_dio *dio;
 	loff_t ret = 0;
+	bool is_read = iov_iter_rw(iter) == READ;
+	bool atomic_write = (iocb->ki_flags & IOCB_ATOMIC) && !is_read;
 
 	trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);
 
 	if (!iomi.len)
 		return NULL;
 
+	if (atomic_write && !iter_is_ubuf(iter))
+		return ERR_PTR(-EINVAL);
+
 	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
 	if (!dio)
 		return ERR_PTR(-ENOMEM);
@@ -579,7 +595,7 @@  __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (iocb->ki_flags & IOCB_NOWAIT)
 		iomi.flags |= IOMAP_NOWAIT;
 
-	if (iov_iter_rw(iter) == READ) {
+	if (is_read) {
 		/* reads can always complete inline */
 		dio->flags |= IOMAP_DIO_INLINE_COMP;
 
@@ -605,6 +621,9 @@  __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
 			dio->flags |= IOMAP_DIO_CALLER_COMP;
 
+		if (atomic_write)
+			iomi.flags |= IOMAP_ATOMIC;
+
 		if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
 			ret = -EAGAIN;
 			if (iomi.pos >= dio->i_size ||
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index c16fd55f5595..c95576420bca 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -98,7 +98,8 @@  DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
 	{ IOMAP_REPORT,		"REPORT" }, \
 	{ IOMAP_FAULT,		"FAULT" }, \
 	{ IOMAP_DIRECT,		"DIRECT" }, \
-	{ IOMAP_NOWAIT,		"NOWAIT" }
+	{ IOMAP_NOWAIT,		"NOWAIT" }, \
+	{ IOMAP_ATOMIC,		"ATOMIC" }
 
 #define IOMAP_F_FLAGS_STRINGS \
 	{ IOMAP_F_NEW,		"NEW" }, \
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 96dd0acbba44..9eac704a0d6f 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -178,6 +178,7 @@  struct iomap_folio_ops {
 #else
 #define IOMAP_DAX		0
 #endif /* CONFIG_FS_DAX */
+#define IOMAP_ATOMIC		(1 << 9)
 
 struct iomap_ops {
 	/*