diff mbox series

[v2,10/11] iomap: Add support for zone append writes

Message ID 20200324152454.4954-11-johannes.thumshirn@wdc.com (mailing list archive)
State New, archived
Headers show
Series Introduce Zone Append for writing to zoned block devices | expand

Commit Message

Johannes Thumshirn March 24, 2020, 3:24 p.m. UTC
Use REQ_OP_ZONE_APPEND for direct I/O write BIOs, instead of REQ_OP_WRITE
if the file-system requests it. The file system can make this request
by setting the new flag IOCB_ZONE_APPEND for a direct I/O kiocb before
calling iomap_dio_rw(). Using this information, this function propagates
the zone append flag using IOMAP_ZONE_APPEND to the file system
iomap_begin() method. The BIOs submitted for the zone append DIO will be
set to use the REQ_OP_ZONE_APPEND operation.

Since zone append operations cannot be split, the iomap_apply() and
iomap_dio_rw() internal loops are executed only once, which may result
in short writes.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/iomap/direct-io.c  | 80 ++++++++++++++++++++++++++++++++++++-------
 include/linux/fs.h    |  1 +
 include/linux/iomap.h | 22 ++++++------
 3 files changed, 79 insertions(+), 24 deletions(-)

Comments

Christoph Hellwig March 24, 2020, 3:41 p.m. UTC | #1
On Wed, Mar 25, 2020 at 12:24:53AM +0900, Johannes Thumshirn wrote:
> @@ -39,6 +40,7 @@ struct iomap_dio {
>  			struct task_struct	*waiter;
>  			struct request_queue	*last_queue;
>  			blk_qc_t		cookie;
> +			sector_t		sector;
>  		} submit;
>  
>  		/* used for aio completion: */
> @@ -151,6 +153,9 @@ static void iomap_dio_bio_end_io(struct bio *bio)
>  	if (bio->bi_status)
>  		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
>  
> +	if (dio->flags & IOMAP_DIO_ZONE_APPEND)
> +		dio->submit.sector = bio->bi_iter.bi_sector;

The submit member in struct iomap_dio is for submit-time information,
while this is used in the completion path..

>  		nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
>  		iomap_dio_submit_bio(dio, iomap, bio);
> +
> +		/*
> +		 * Issuing multiple BIOs for a large zone append write can
> +		 * result in reordering of the write fragments and to data
> +		 * corruption. So always stop after the first BIO is issued.
> +		 */
> +		if (zone_append)
> +			break;

At least for a normal file system that is absolutely not true.  If
zonefs is so special it might be better off just using a slightly tweaked
copy of blkdev_direct_IO rather than using iomap.

> @@ -446,6 +486,11 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		flags |= IOMAP_WRITE;
>  		dio->flags |= IOMAP_DIO_WRITE;
>  
> +		if (iocb->ki_flags & IOCB_ZONE_APPEND) {
> +			flags |= IOMAP_ZONE_APPEND;
> +			dio->flags |= IOMAP_DIO_ZONE_APPEND;
> +		}
> +
>  		/* for data sync or sync, we need sync completion processing */
>  		if (iocb->ki_flags & IOCB_DSYNC)
>  			dio->flags |= IOMAP_DIO_NEED_SYNC;
> @@ -516,6 +561,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  			iov_iter_revert(iter, pos - dio->i_size);
>  			break;
>  		}
> +
> +		/*
> +		 * Zone append writes cannot be split and be shorted. Break
> +		 * here to let the user know instead of sending more IOs which
> +		 * could get reordered and corrupt the written data.
> +		 */
> +		if (flags & IOMAP_ZONE_APPEND)
> +			break;

But that isn't what we do here.  You exit after a single apply iteration
which is perfectly fine - at least for a normal file system, zonefs
is rather weird.

> +
>  	} while ((count = iov_iter_count(iter)) > 0);
>  	blk_finish_plug(&plug);
>  
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 3cd4fe6b845e..aa4ad705e549 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -314,6 +314,7 @@ enum rw_hint {
>  #define IOCB_SYNC		(1 << 5)
>  #define IOCB_WRITE		(1 << 6)
>  #define IOCB_NOWAIT		(1 << 7)
> +#define IOCB_ZONE_APPEND	(1 << 8)

I don't think the iocb is the right interface for passing this
kind of information.  We currently pass a bool wait to iomap_dio_rw
which really should be flags.  I have a pending patch for that.

> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 8b09463dae0d..16c17a79e53d 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -68,7 +68,6 @@ struct vm_fault;
>   */
>  #define IOMAP_F_PRIVATE		0x1000
>  
> -

Spurious whitespace change.

>  /*
>   * Magic value for addr:
>   */
> @@ -95,6 +94,17 @@ iomap_sector(struct iomap *iomap, loff_t pos)
>  	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
>  }
>  
> +/*
> + * Flags for iomap_begin / iomap_end.  No flag implies a read.
> + */
> +#define IOMAP_WRITE		(1 << 0) /* writing, must allocate blocks */
> +#define IOMAP_ZERO		(1 << 1) /* zeroing operation, may skip holes */
> +#define IOMAP_REPORT		(1 << 2) /* report extent status, e.g. FIEMAP */
> +#define IOMAP_FAULT		(1 << 3) /* mapping for page fault */
> +#define IOMAP_DIRECT		(1 << 4) /* direct I/O */
> +#define IOMAP_NOWAIT		(1 << 5) /* do not block */
> +#define IOMAP_ZONE_APPEND	(1 << 6) /* Use zone append for writes */

Why is this moved around?
Dave Chinner March 24, 2020, 10:45 p.m. UTC | #2
On Wed, Mar 25, 2020 at 12:24:53AM +0900, Johannes Thumshirn wrote:
> Use REQ_OP_ZONE_APPEND for direct I/O write BIOs, instead of REQ_OP_WRITE
> if the file-system requests it. The file system can make this request
> by setting the new flag IOCB_ZONE_APPEND for a direct I/O kiocb before
> calling iompa_dio_rw(). Using this information, this function propagates
> the zone append flag using IOMAP_ZONE_APPEND to the file system
> iomap_begin() method. The BIOs submitted for the zone append DIO will be
> set to use the REQ_OP_ZONE_APPEND operation.
> 
> Since zone append operations cannot be split, the iomap_apply() and
> iomap_dio_rw() internal loops are executed only once, which may result
> in short writes.
> 
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>  fs/iomap/direct-io.c  | 80 ++++++++++++++++++++++++++++++++++++-------
>  include/linux/fs.h    |  1 +
>  include/linux/iomap.h | 22 ++++++------
>  3 files changed, 79 insertions(+), 24 deletions(-)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 23837926c0c5..b3e2aadce72f 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -17,6 +17,7 @@
>   * Private flags for iomap_dio, must not overlap with the public ones in
>   * iomap.h:
>   */
> +#define IOMAP_DIO_ZONE_APPEND	(1 << 27)
>  #define IOMAP_DIO_WRITE_FUA	(1 << 28)
>  #define IOMAP_DIO_NEED_SYNC	(1 << 29)
>  #define IOMAP_DIO_WRITE		(1 << 30)
> @@ -39,6 +40,7 @@ struct iomap_dio {
>  			struct task_struct	*waiter;
>  			struct request_queue	*last_queue;
>  			blk_qc_t		cookie;
> +			sector_t		sector;
>  		} submit;
>  
>  		/* used for aio completion: */
> @@ -151,6 +153,9 @@ static void iomap_dio_bio_end_io(struct bio *bio)
>  	if (bio->bi_status)
>  		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
>  
> +	if (dio->flags & IOMAP_DIO_ZONE_APPEND)
> +		dio->submit.sector = bio->bi_iter.bi_sector;
> +
>  	if (atomic_dec_and_test(&dio->ref)) {
>  		if (dio->wait_for_completion) {
>  			struct task_struct *waiter = dio->submit.waiter;
> @@ -194,6 +199,21 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
>  	iomap_dio_submit_bio(dio, iomap, bio);
>  }
>  
> +static sector_t
> +iomap_dio_bio_sector(struct iomap_dio *dio, struct iomap *iomap, loff_t pos)
> +{
> +	sector_t sector = iomap_sector(iomap, pos);
> +
> +	/*
> +	 * For zone append writes, the BIO needs to point at the start of the
> +	 * zone to append to.
> +	 */
> +	if (dio->flags & IOMAP_DIO_ZONE_APPEND)
> +		sector = ALIGN_DOWN(sector, bdev_zone_sectors(iomap->bdev));
> +
> +	return sector;
> +}

This seems to me like it should be done by the ->iomap_begin
implementation when mapping the IO. I don't see why this needs to be
specially handled by the iomap dio code.

> +
>  static loff_t
>  iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>  		struct iomap_dio *dio, struct iomap *iomap)
> @@ -204,6 +224,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>  	struct bio *bio;
>  	bool need_zeroout = false;
>  	bool use_fua = false;
> +	bool zone_append = false;
>  	int nr_pages, ret = 0;
>  	size_t copied = 0;
>  	size_t orig_count;
> @@ -235,6 +256,9 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>  			use_fua = true;
>  	}
>  
> +	if (dio->flags & IOMAP_DIO_ZONE_APPEND)
> +		zone_append = true;
> +
>  	/*
>  	 * Save the original count and trim the iter to just the extent we
>  	 * are operating on right now.  The iter will be re-expanded once
> @@ -266,12 +290,28 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>  
>  		bio = bio_alloc(GFP_KERNEL, nr_pages);
>  		bio_set_dev(bio, iomap->bdev);
> -		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
> +		bio->bi_iter.bi_sector = iomap_dio_bio_sector(dio, iomap, pos);
>  		bio->bi_write_hint = dio->iocb->ki_hint;
>  		bio->bi_ioprio = dio->iocb->ki_ioprio;
>  		bio->bi_private = dio;
>  		bio->bi_end_io = iomap_dio_bio_end_io;
>  
> +		if (dio->flags & IOMAP_DIO_WRITE) {
> +			bio->bi_opf = REQ_SYNC | REQ_IDLE;
> +			if (zone_append)
> +				bio->bi_opf |= REQ_OP_ZONE_APPEND;
> +			else
> +				bio->bi_opf |= REQ_OP_WRITE;
> +			if (use_fua)
> +				bio->bi_opf |= REQ_FUA;
> +			else
> +				dio->flags &= ~IOMAP_DIO_WRITE_FUA;
> +		} else {
> +			bio->bi_opf = REQ_OP_READ;
> +			if (dio->flags & IOMAP_DIO_DIRTY)
> +				bio_set_pages_dirty(bio);
> +		}

Why move all this code? If it's needed, please split it into a
separate patchi to separate it from the new functionality...

> +
>  		ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
>  		if (unlikely(ret)) {
>  			/*
> @@ -284,19 +324,10 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>  			goto zero_tail;
>  		}
>  
> -		n = bio->bi_iter.bi_size;
> -		if (dio->flags & IOMAP_DIO_WRITE) {
> -			bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
> -			if (use_fua)
> -				bio->bi_opf |= REQ_FUA;
> -			else
> -				dio->flags &= ~IOMAP_DIO_WRITE_FUA;
> +		if (dio->flags & IOMAP_DIO_WRITE)
>  			task_io_account_write(n);
> -		} else {
> -			bio->bi_opf = REQ_OP_READ;
> -			if (dio->flags & IOMAP_DIO_DIRTY)
> -				bio_set_pages_dirty(bio);
> -		}
> +
> +		n = bio->bi_iter.bi_size;
>  
>  		dio->size += n;
>  		pos += n;
> @@ -304,6 +335,15 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>  
>  		nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
>  		iomap_dio_submit_bio(dio, iomap, bio);
> +
> +		/*
> +		 * Issuing multiple BIOs for a large zone append write can
> +		 * result in reordering of the write fragments and to data
> +		 * corruption. So always stop after the first BIO is issued.
> +		 */
> +		if (zone_append)
> +			break;

I don't think this sort of functionality should be tied to "zone
append". If there is a need for "issue a single (short) bio only" it
should be a flag to iomap_dio_rw() set by the filesystem, which can
then handle the short read/write that is returned.

> +		/*
> +		 * Zone append writes cannot be split and be shorted. Break
> +		 * here to let the user know instead of sending more IOs which
> +		 * could get reordered and corrupt the written data.
> +		 */
> +		if (flags & IOMAP_ZONE_APPEND)
> +			break;

ditto.

Cheers,

Dave.
Damien Le Moal March 25, 2020, 5:27 a.m. UTC | #3
On 2020/03/25 0:41, Christoph Hellwig wrote:
> On Wed, Mar 25, 2020 at 12:24:53AM +0900, Johannes Thumshirn wrote:
>> @@ -39,6 +40,7 @@ struct iomap_dio {
>>  			struct task_struct	*waiter;
>>  			struct request_queue	*last_queue;
>>  			blk_qc_t		cookie;
>> +			sector_t		sector;
>>  		} submit;
>>  
>>  		/* used for aio completion: */
>> @@ -151,6 +153,9 @@ static void iomap_dio_bio_end_io(struct bio *bio)
>>  	if (bio->bi_status)
>>  		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
>>  
>> +	if (dio->flags & IOMAP_DIO_ZONE_APPEND)
>> +		dio->submit.sector = bio->bi_iter.bi_sector;
> 
> The submit member in struct iomap_dio is for submit-time information,
> while this is used in the completion path..
> 
>>  		nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
>>  		iomap_dio_submit_bio(dio, iomap, bio);
>> +
>> +		/*
>> +		 * Issuing multiple BIOs for a large zone append write can
>> +		 * result in reordering of the write fragments and to data
>> +		 * corruption. So always stop after the first BIO is issued.
>> +		 */
>> +		if (zone_append)
>> +			break;
> 
> At least for a normal file system that is absolutely not true.  If
> zonefs is so special it might be better of just using a slightly tweaked
> copy of blkdev_direct_IO rather than using iomap.

It would be very nice to not have to add this direct BIO use case in zonefs
since that would be only for writes to sequential zones while all other
operations use iomap. So instead of this, what about using a flag as Dave
suggested (see below comment too) ?

> 
>> @@ -446,6 +486,11 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>>  		flags |= IOMAP_WRITE;
>>  		dio->flags |= IOMAP_DIO_WRITE;
>>  
>> +		if (iocb->ki_flags & IOCB_ZONE_APPEND) {
>> +			flags |= IOMAP_ZONE_APPEND;
>> +			dio->flags |= IOMAP_DIO_ZONE_APPEND;
>> +		}
>> +
>>  		/* for data sync or sync, we need sync completion processing */
>>  		if (iocb->ki_flags & IOCB_DSYNC)
>>  			dio->flags |= IOMAP_DIO_NEED_SYNC;
>> @@ -516,6 +561,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>>  			iov_iter_revert(iter, pos - dio->i_size);
>>  			break;
>>  		}
>> +
>> +		/*
>> +		 * Zone append writes cannot be split and be shorted. Break
>> +		 * here to let the user know instead of sending more IOs which
>> +		 * could get reordered and corrupt the written data.
>> +		 */
>> +		if (flags & IOMAP_ZONE_APPEND)
>> +			break;
> 
> But that isn't what we do here.  You exit after a single apply iteration
> which is perfectly fine - at at least for a normal file system, zonefs
> is rather weird.

The comment is indeed not clear. For the short write, as Dave suggested, we
should have a special flag for it. So would you be OK if we replace this with
something like

		if (flags & IOMAP_SHORT_WRITE)
			break;

Normal file systems with real block mapping metadata would not need this as they
can perfectly handle non sequential zone append fragments of a large DIO. But
zonefs will need the short writes since it lacks file block mapping metadata.

> 
>> +
>>  	} while ((count = iov_iter_count(iter)) > 0);
>>  	blk_finish_plug(&plug);
>>  
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index 3cd4fe6b845e..aa4ad705e549 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -314,6 +314,7 @@ enum rw_hint {
>>  #define IOCB_SYNC		(1 << 5)
>>  #define IOCB_WRITE		(1 << 6)
>>  #define IOCB_NOWAIT		(1 << 7)
>> +#define IOCB_ZONE_APPEND	(1 << 8)
> 
> I don't think the iocb is the right interface for passing this
> kind of information.  We currently pass a bool wait to iomap_dio_rw
> which really should be flags.  I have a pending patch for that.

Is that patch queued in iomap or xfs tree ? Could you point us to it please ?

> 
>> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
>> index 8b09463dae0d..16c17a79e53d 100644
>> --- a/include/linux/iomap.h
>> +++ b/include/linux/iomap.h
>> @@ -68,7 +68,6 @@ struct vm_fault;
>>   */
>>  #define IOMAP_F_PRIVATE		0x1000
>>  
>> -
> 
> Spurious whitespace change.
> 
>>  /*
>>   * Magic value for addr:
>>   */
>> @@ -95,6 +94,17 @@ iomap_sector(struct iomap *iomap, loff_t pos)
>>  	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
>>  }
>>  
>> +/*
>> + * Flags for iomap_begin / iomap_end.  No flag implies a read.
>> + */
>> +#define IOMAP_WRITE		(1 << 0) /* writing, must allocate blocks */
>> +#define IOMAP_ZERO		(1 << 1) /* zeroing operation, may skip holes */
>> +#define IOMAP_REPORT		(1 << 2) /* report extent status, e.g. FIEMAP */
>> +#define IOMAP_FAULT		(1 << 3) /* mapping for page fault */
>> +#define IOMAP_DIRECT		(1 << 4) /* direct I/O */
>> +#define IOMAP_NOWAIT		(1 << 5) /* do not block */
>> +#define IOMAP_ZONE_APPEND	(1 << 6) /* Use zone append for writes */
> 
> Why is this moved around?
>
Damien Le Moal March 25, 2020, 5:38 a.m. UTC | #4
On 2020/03/25 7:46, Dave Chinner wrote:
> On Wed, Mar 25, 2020 at 12:24:53AM +0900, Johannes Thumshirn wrote:
>> Use REQ_OP_ZONE_APPEND for direct I/O write BIOs, instead of REQ_OP_WRITE
>> if the file-system requests it. The file system can make this request
>> by setting the new flag IOCB_ZONE_APPEND for a direct I/O kiocb before
>> calling iompa_dio_rw(). Using this information, this function propagates
>> the zone append flag using IOMAP_ZONE_APPEND to the file system
>> iomap_begin() method. The BIOs submitted for the zone append DIO will be
>> set to use the REQ_OP_ZONE_APPEND operation.
>>
>> Since zone append operations cannot be split, the iomap_apply() and
>> iomap_dio_rw() internal loops are executed only once, which may result
>> in short writes.
>>
>> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>> ---
>>  fs/iomap/direct-io.c  | 80 ++++++++++++++++++++++++++++++++++++-------
>>  include/linux/fs.h    |  1 +
>>  include/linux/iomap.h | 22 ++++++------
>>  3 files changed, 79 insertions(+), 24 deletions(-)
>>
>> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
>> index 23837926c0c5..b3e2aadce72f 100644
>> --- a/fs/iomap/direct-io.c
>> +++ b/fs/iomap/direct-io.c
>> @@ -17,6 +17,7 @@
>>   * Private flags for iomap_dio, must not overlap with the public ones in
>>   * iomap.h:
>>   */
>> +#define IOMAP_DIO_ZONE_APPEND	(1 << 27)
>>  #define IOMAP_DIO_WRITE_FUA	(1 << 28)
>>  #define IOMAP_DIO_NEED_SYNC	(1 << 29)
>>  #define IOMAP_DIO_WRITE		(1 << 30)
>> @@ -39,6 +40,7 @@ struct iomap_dio {
>>  			struct task_struct	*waiter;
>>  			struct request_queue	*last_queue;
>>  			blk_qc_t		cookie;
>> +			sector_t		sector;
>>  		} submit;
>>  
>>  		/* used for aio completion: */
>> @@ -151,6 +153,9 @@ static void iomap_dio_bio_end_io(struct bio *bio)
>>  	if (bio->bi_status)
>>  		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
>>  
>> +	if (dio->flags & IOMAP_DIO_ZONE_APPEND)
>> +		dio->submit.sector = bio->bi_iter.bi_sector;
>> +
>>  	if (atomic_dec_and_test(&dio->ref)) {
>>  		if (dio->wait_for_completion) {
>>  			struct task_struct *waiter = dio->submit.waiter;
>> @@ -194,6 +199,21 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
>>  	iomap_dio_submit_bio(dio, iomap, bio);
>>  }
>>  
>> +static sector_t
>> +iomap_dio_bio_sector(struct iomap_dio *dio, struct iomap *iomap, loff_t pos)
>> +{
>> +	sector_t sector = iomap_sector(iomap, pos);
>> +
>> +	/*
>> +	 * For zone append writes, the BIO needs to point at the start of the
>> +	 * zone to append to.
>> +	 */
>> +	if (dio->flags & IOMAP_DIO_ZONE_APPEND)
>> +		sector = ALIGN_DOWN(sector, bdev_zone_sectors(iomap->bdev));
>> +
>> +	return sector;
>> +}
> 
> This seems to me like it should be done by the ->iomap_begin
> implementation when mapping the IO. I don't see why this needs to be
> specially handled by the iomap dio code.

Fair point. However, iomap_sector() does not simply return iomap->addr >> 9. So
that means that in iomap_begin, the mapping address and offset returned needs to
account for the iomap_sector() calculation, i.e.

(iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT

which does not result in a very obvious values for iomap address and offset.
Well I guess for zone append we simply need to set
iomap->offset = pos;
and
iomap->addr = zone start sector;
and that would then work. This however look fragile to me.

> 
>> +
>>  static loff_t
>>  iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>>  		struct iomap_dio *dio, struct iomap *iomap)
>> @@ -204,6 +224,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>>  	struct bio *bio;
>>  	bool need_zeroout = false;
>>  	bool use_fua = false;
>> +	bool zone_append = false;
>>  	int nr_pages, ret = 0;
>>  	size_t copied = 0;
>>  	size_t orig_count;
>> @@ -235,6 +256,9 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>>  			use_fua = true;
>>  	}
>>  
>> +	if (dio->flags & IOMAP_DIO_ZONE_APPEND)
>> +		zone_append = true;
>> +
>>  	/*
>>  	 * Save the original count and trim the iter to just the extent we
>>  	 * are operating on right now.  The iter will be re-expanded once
>> @@ -266,12 +290,28 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>>  
>>  		bio = bio_alloc(GFP_KERNEL, nr_pages);
>>  		bio_set_dev(bio, iomap->bdev);
>> -		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
>> +		bio->bi_iter.bi_sector = iomap_dio_bio_sector(dio, iomap, pos);
>>  		bio->bi_write_hint = dio->iocb->ki_hint;
>>  		bio->bi_ioprio = dio->iocb->ki_ioprio;
>>  		bio->bi_private = dio;
>>  		bio->bi_end_io = iomap_dio_bio_end_io;
>>  
>> +		if (dio->flags & IOMAP_DIO_WRITE) {
>> +			bio->bi_opf = REQ_SYNC | REQ_IDLE;
>> +			if (zone_append)
>> +				bio->bi_opf |= REQ_OP_ZONE_APPEND;
>> +			else
>> +				bio->bi_opf |= REQ_OP_WRITE;
>> +			if (use_fua)
>> +				bio->bi_opf |= REQ_FUA;
>> +			else
>> +				dio->flags &= ~IOMAP_DIO_WRITE_FUA;
>> +		} else {
>> +			bio->bi_opf = REQ_OP_READ;
>> +			if (dio->flags & IOMAP_DIO_DIRTY)
>> +				bio_set_pages_dirty(bio);
>> +		}
> 
> Why move all this code? If it's needed, please split it into a
> separate patchi to separate it from the new functionality...

The BIO add page in bio_iov_iter_get_pages() needs to know that the BIO is a
zone append to stop adding pages before the BIO grows too large and result in a
split when it is submitted. So bio->bi_opf needs to be set before calling
bio_iov_iter_get_pages(). So this change is related to the new functionality.
But I guess it is OK to do it regardless of the BIO operation.

> 
>> +
>>  		ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
>>  		if (unlikely(ret)) {
>>  			/*
>> @@ -284,19 +324,10 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>>  			goto zero_tail;
>>  		}
>>  
>> -		n = bio->bi_iter.bi_size;
>> -		if (dio->flags & IOMAP_DIO_WRITE) {
>> -			bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
>> -			if (use_fua)
>> -				bio->bi_opf |= REQ_FUA;
>> -			else
>> -				dio->flags &= ~IOMAP_DIO_WRITE_FUA;
>> +		if (dio->flags & IOMAP_DIO_WRITE)
>>  			task_io_account_write(n);
>> -		} else {
>> -			bio->bi_opf = REQ_OP_READ;
>> -			if (dio->flags & IOMAP_DIO_DIRTY)
>> -				bio_set_pages_dirty(bio);
>> -		}
>> +
>> +		n = bio->bi_iter.bi_size;
>>  
>>  		dio->size += n;
>>  		pos += n;
>> @@ -304,6 +335,15 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>>  
>>  		nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
>>  		iomap_dio_submit_bio(dio, iomap, bio);
>> +
>> +		/*
>> +		 * Issuing multiple BIOs for a large zone append write can
>> +		 * result in reordering of the write fragments and to data
>> +		 * corruption. So always stop after the first BIO is issued.
>> +		 */
>> +		if (zone_append)
>> +			break;
> 
> I don't think this sort of functionality should be tied to "zone
> append". If there is a need for "issue a single (short) bio only" it
> should be a flag to iomap_dio_rw() set by the filesystem, which can
> then handle the short read/write that is returned.

Yes, that would be cleaner.

> 
>> +		/*
>> +		 * Zone append writes cannot be split and be shorted. Break
>> +		 * here to let the user know instead of sending more IOs which
>> +		 * could get reordered and corrupt the written data.
>> +		 */
>> +		if (flags & IOMAP_ZONE_APPEND)
>> +			break;
> 
> ditto.
> 
> Cheers,
> 
> Dave.
>
Christoph Hellwig March 25, 2020, 6:25 a.m. UTC | #5
On Wed, Mar 25, 2020 at 05:27:38AM +0000, Damien Le Moal wrote:
> > At least for a normal file system that is absolutely not true.  If
> > zonefs is so special it might be better of just using a slightly tweaked
> > copy of blkdev_direct_IO rather than using iomap.
> 
> It would be very nice to not have to add this direct BIO use case in zonefs
> since that would be only for writes to sequential zones while all other
> operations use iomap. So instead of this, what about using a flag as Dave
> suggested (see below comment too) ?

Given how special the use case is I'm not sure overloading iomap
is a good idea.  Think of how a "normal" zone aware file system would
use iomap and not of this will apply.  OTOH the "simple" single bio
code in __blkdev_direct_IO_simple is less than 100 lines of code.  I
think having a specialized code base for a specialized use case
might be better than overloading generic code with tons of flags.

> > I don't think the iocb is the right interface for passing this
> > kind of information.  We currently pass a bool wait to iomap_dio_rw
> > which really should be flags.  I have a pending patch for that.
> 
> Is that patch queued in iomap or xfs tree ? Could you point us to it please ?

It isn't queued up anywhere yet.
Johannes Thumshirn March 25, 2020, 8:32 a.m. UTC | #6
On 24/03/2020 23:46, Dave Chinner wrote:
>> @@ -266,12 +290,28 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>>   
>>   		bio = bio_alloc(GFP_KERNEL, nr_pages);
>>   		bio_set_dev(bio, iomap->bdev);
>> -		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
>> +		bio->bi_iter.bi_sector = iomap_dio_bio_sector(dio, iomap, pos);
>>   		bio->bi_write_hint = dio->iocb->ki_hint;
>>   		bio->bi_ioprio = dio->iocb->ki_ioprio;
>>   		bio->bi_private = dio;
>>   		bio->bi_end_io = iomap_dio_bio_end_io;
>>   
>> +		if (dio->flags & IOMAP_DIO_WRITE) {
>> +			bio->bi_opf = REQ_SYNC | REQ_IDLE;
>> +			if (zone_append)
>> +				bio->bi_opf |= REQ_OP_ZONE_APPEND;
>> +			else
>> +				bio->bi_opf |= REQ_OP_WRITE;
>> +			if (use_fua)
>> +				bio->bi_opf |= REQ_FUA;
>> +			else
>> +				dio->flags &= ~IOMAP_DIO_WRITE_FUA;
>> +		} else {
>> +			bio->bi_opf = REQ_OP_READ;
>> +			if (dio->flags & IOMAP_DIO_DIRTY)
>> +				bio_set_pages_dirty(bio);
>> +		}
> Why move all this code? If it's needed, please split it into a
> separate patchi to separate it from the new functionality...
> 

The code is moved as bio_iov_iter_get_pages() needs the correct bi_opf 
set for zone append to be able to check the limits (see patch 03/11, 
block: Introduce REQ_OP_ZONE_APPEND in this series).

The call chain is:
bio_iov_iter_get_pages()
`-> bio_iov_iter_get_pages()
     `-> bio_full()
         `-> bio_can_zone_append()

I'm not sure if separating this movement would make it clearer, apart 
from a commit message?
Johannes Thumshirn March 25, 2020, 9:45 a.m. UTC | #7
On 24/03/2020 16:41, Christoph Hellwig wrote:
>> +
>> +		/*
>> +		 * Issuing multiple BIOs for a large zone append write can
>> +		 * result in reordering of the write fragments and to data
>> +		 * corruption. So always stop after the first BIO is issued.
>> +		 */
>> +		if (zone_append)
>> +			break;
> At least for a normal file system that is absolutely not true.  If
> zonefs is so special it might be better of just using a slightly tweaked
> copy of blkdev_direct_IO rather than using iomap.
> 

Can you please elaborate on that? Why doesn't this hold true for a 
normal file system? If we split the DIO write into multiple BIOs with 
zone-append, there is nothing which guarantees the order of the written 
data (at least as far as I can see).

So if we have this DIO write:
|AAAAA|BBBB|CCCC|DDDD|
and we have to split it for whatever reason, what safe guards us from it 
ending up on disk like this:
|CCCC|DDDD|AAAA|BBBB|

This is essentially the same reason we can't split zone-append BIOs, or 
am I totally off track now?

Thanks,
	Johannes
Christoph Hellwig March 25, 2020, 9:48 a.m. UTC | #8
On Wed, Mar 25, 2020 at 09:45:39AM +0000, Johannes Thumshirn wrote:
> 
> Can you please elaborate on that? Why doesn't this hold true for a 
> normal file system? If we split the DIO write into multiple BIOs with 
> zone-append, there is nothing which guarantees the order of the written 
> data (at least as far as I can see).

Of course nothing guarantees the order.  But the whole point is that the
order does not matter.
Johannes Thumshirn March 25, 2020, 9:54 a.m. UTC | #9
On 25/03/2020 10:48, hch@infradead.org wrote:
> On Wed, Mar 25, 2020 at 09:45:39AM +0000, Johannes Thumshirn wrote:
>>
>> Can you please elaborate on that? Why doesn't this hold true for a
>> normal file system? If we split the DIO write into multiple BIOs with
>> zone-append, there is nothing which guarantees the order of the written
>> data (at least as far as I can see).
> 
> Of course nothing gurantees the order.  But the whole point is that the
> order does not matter.
> 

Ok now I'm totally lost. The order of data inside an extent does matter, 
otherwise we have data corruption.
Damien Le Moal March 25, 2020, 9:59 a.m. UTC | #10
On 2020/03/25 18:48, hch@infradead.org wrote:
> On Wed, Mar 25, 2020 at 09:45:39AM +0000, Johannes Thumshirn wrote:
>>
>> Can you please elaborate on that? Why doesn't this hold true for a 
>> normal file system? If we split the DIO write into multiple BIOs with 
>> zone-append, there is nothing which guarantees the order of the written 
>> data (at least as far as I can see).
> 
> Of course nothing gurantees the order.  But the whole point is that the
> order does not matter.  
> 

The order does not matter at the DIO level since iomap dio end callback will
allow the FS to add an extent mapping the written data using the drive indicated
write location. But that callback is for the entire DIO, not per BIO fragment of
the DIO. So if the BIO fragments of a large DIO get reordered, as Johannes said,
we will get data corruption in the FS extent. No ?
Christoph Hellwig March 25, 2020, 10:01 a.m. UTC | #11
On Wed, Mar 25, 2020 at 09:59:19AM +0000, Damien Le Moal wrote:
> On 2020/03/25 18:48, hch@infradead.org wrote:
> > On Wed, Mar 25, 2020 at 09:45:39AM +0000, Johannes Thumshirn wrote:
> >>
> >> Can you please elaborate on that? Why doesn't this hold true for a 
> >> normal file system? If we split the DIO write into multiple BIOs with 
> >> zone-append, there is nothing which guarantees the order of the written 
> >> data (at least as far as I can see).
> > 
> > Of course nothing gurantees the order.  But the whole point is that the
> > order does not matter.  
> > 
> 
> The order does not matter at the DIO level since iomap dio end callback will
> allow the FS to add an extent mapping the written data using the drive indicated
> write location. But that callback is for the entire DIO, not per BIO fragment of
> the DIO. So if the BIO fragments of a large DIO get reordered, as Johannes said,
> we will get data corruption in the FS extent. No ?

I thought of recording the location in ->iomap_end (and in fact
had a prototype for that), but that is not going to work for AIO of
course.  So yes, we'll need some way to have per-extent completion
callbacks.
Johannes Thumshirn March 25, 2020, 10:15 a.m. UTC | #12
On 25/03/2020 11:01, hch@infradead.org wrote:
> I thought of recording the location in ->iomap_end (and in fact
> had a prototype for that), but that is not going to work for AIO of
> course.  So yes, we'll need some way to have per-extent completion
> callbacks.

OK let's postpone iomap then and handle zone-append directly in zonefs 
for now.
diff mbox series

Patch

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 23837926c0c5..b3e2aadce72f 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -17,6 +17,7 @@ 
  * Private flags for iomap_dio, must not overlap with the public ones in
  * iomap.h:
  */
+#define IOMAP_DIO_ZONE_APPEND	(1 << 27)
 #define IOMAP_DIO_WRITE_FUA	(1 << 28)
 #define IOMAP_DIO_NEED_SYNC	(1 << 29)
 #define IOMAP_DIO_WRITE		(1 << 30)
@@ -39,6 +40,7 @@  struct iomap_dio {
 			struct task_struct	*waiter;
 			struct request_queue	*last_queue;
 			blk_qc_t		cookie;
+			sector_t		sector;
 		} submit;
 
 		/* used for aio completion: */
@@ -151,6 +153,9 @@  static void iomap_dio_bio_end_io(struct bio *bio)
 	if (bio->bi_status)
 		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
 
+	if (dio->flags & IOMAP_DIO_ZONE_APPEND)
+		dio->submit.sector = bio->bi_iter.bi_sector;
+
 	if (atomic_dec_and_test(&dio->ref)) {
 		if (dio->wait_for_completion) {
 			struct task_struct *waiter = dio->submit.waiter;
@@ -194,6 +199,21 @@  iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
 	iomap_dio_submit_bio(dio, iomap, bio);
 }
 
+static sector_t
+iomap_dio_bio_sector(struct iomap_dio *dio, struct iomap *iomap, loff_t pos)
+{
+	sector_t sector = iomap_sector(iomap, pos);
+
+	/*
+	 * For zone append writes, the BIO needs to point at the start of the
+	 * zone to append to.
+	 */
+	if (dio->flags & IOMAP_DIO_ZONE_APPEND)
+		sector = ALIGN_DOWN(sector, bdev_zone_sectors(iomap->bdev));
+
+	return sector;
+}
+
 static loff_t
 iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 		struct iomap_dio *dio, struct iomap *iomap)
@@ -204,6 +224,7 @@  iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 	struct bio *bio;
 	bool need_zeroout = false;
 	bool use_fua = false;
+	bool zone_append = false;
 	int nr_pages, ret = 0;
 	size_t copied = 0;
 	size_t orig_count;
@@ -235,6 +256,9 @@  iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 			use_fua = true;
 	}
 
+	if (dio->flags & IOMAP_DIO_ZONE_APPEND)
+		zone_append = true;
+
 	/*
 	 * Save the original count and trim the iter to just the extent we
 	 * are operating on right now.  The iter will be re-expanded once
@@ -266,12 +290,28 @@  iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 
 		bio = bio_alloc(GFP_KERNEL, nr_pages);
 		bio_set_dev(bio, iomap->bdev);
-		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
+		bio->bi_iter.bi_sector = iomap_dio_bio_sector(dio, iomap, pos);
 		bio->bi_write_hint = dio->iocb->ki_hint;
 		bio->bi_ioprio = dio->iocb->ki_ioprio;
 		bio->bi_private = dio;
 		bio->bi_end_io = iomap_dio_bio_end_io;
 
+		if (dio->flags & IOMAP_DIO_WRITE) {
+			bio->bi_opf = REQ_SYNC | REQ_IDLE;
+			if (zone_append)
+				bio->bi_opf |= REQ_OP_ZONE_APPEND;
+			else
+				bio->bi_opf |= REQ_OP_WRITE;
+			if (use_fua)
+				bio->bi_opf |= REQ_FUA;
+			else
+				dio->flags &= ~IOMAP_DIO_WRITE_FUA;
+		} else {
+			bio->bi_opf = REQ_OP_READ;
+			if (dio->flags & IOMAP_DIO_DIRTY)
+				bio_set_pages_dirty(bio);
+		}
+
 		ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
 		if (unlikely(ret)) {
 			/*
@@ -284,19 +324,10 @@  iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 			goto zero_tail;
 		}
 
-		n = bio->bi_iter.bi_size;
-		if (dio->flags & IOMAP_DIO_WRITE) {
-			bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
-			if (use_fua)
-				bio->bi_opf |= REQ_FUA;
-			else
-				dio->flags &= ~IOMAP_DIO_WRITE_FUA;
+		if (dio->flags & IOMAP_DIO_WRITE)
 			task_io_account_write(n);
-		} else {
-			bio->bi_opf = REQ_OP_READ;
-			if (dio->flags & IOMAP_DIO_DIRTY)
-				bio_set_pages_dirty(bio);
-		}
+
+		n = bio->bi_iter.bi_size;
 
 		dio->size += n;
 		pos += n;
@@ -304,6 +335,15 @@  iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 
 		nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
 		iomap_dio_submit_bio(dio, iomap, bio);
+
+		/*
+		 * Issuing multiple BIOs for a large zone append write can
+		 * result in reordering of the write fragments and lead to
+		 * data corruption. So always stop after the first BIO is issued.
+		 */
+		if (zone_append)
+			break;
+
 	} while (nr_pages);
 
 	/*
@@ -446,6 +486,11 @@  iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		flags |= IOMAP_WRITE;
 		dio->flags |= IOMAP_DIO_WRITE;
 
+		if (iocb->ki_flags & IOCB_ZONE_APPEND) {
+			flags |= IOMAP_ZONE_APPEND;
+			dio->flags |= IOMAP_DIO_ZONE_APPEND;
+		}
+
 		/* for data sync or sync, we need sync completion processing */
 		if (iocb->ki_flags & IOCB_DSYNC)
 			dio->flags |= IOMAP_DIO_NEED_SYNC;
@@ -516,6 +561,15 @@  iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 			iov_iter_revert(iter, pos - dio->i_size);
 			break;
 		}
+
+		/*
+		 * Zone append writes cannot be split and thus may be short. Break
+		 * here to let the user know instead of sending more IOs which
+		 * could get reordered and corrupt the written data.
+		 */
+		if (flags & IOMAP_ZONE_APPEND)
+			break;
+
 	} while ((count = iov_iter_count(iter)) > 0);
 	blk_finish_plug(&plug);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3cd4fe6b845e..aa4ad705e549 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -314,6 +314,7 @@  enum rw_hint {
 #define IOCB_SYNC		(1 << 5)
 #define IOCB_WRITE		(1 << 6)
 #define IOCB_NOWAIT		(1 << 7)
+#define IOCB_ZONE_APPEND	(1 << 8)
 
 struct kiocb {
 	struct file		*ki_filp;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 8b09463dae0d..16c17a79e53d 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -68,7 +68,6 @@  struct vm_fault;
  */
 #define IOMAP_F_PRIVATE		0x1000
 
-
 /*
  * Magic value for addr:
  */
@@ -95,6 +94,17 @@  iomap_sector(struct iomap *iomap, loff_t pos)
 	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
 }
 
+/*
+ * Flags for iomap_begin / iomap_end.  No flag implies a read.
+ */
+#define IOMAP_WRITE		(1 << 0) /* writing, must allocate blocks */
+#define IOMAP_ZERO		(1 << 1) /* zeroing operation, may skip holes */
+#define IOMAP_REPORT		(1 << 2) /* report extent status, e.g. FIEMAP */
+#define IOMAP_FAULT		(1 << 3) /* mapping for page fault */
+#define IOMAP_DIRECT		(1 << 4) /* direct I/O */
+#define IOMAP_NOWAIT		(1 << 5) /* do not block */
+#define IOMAP_ZONE_APPEND	(1 << 6) /* Use zone append for writes */
+
 /*
  * When a filesystem sets page_ops in an iomap mapping it returns, page_prepare
  * and page_done will be called for each page written to.  This only applies to
@@ -112,16 +122,6 @@  struct iomap_page_ops {
 			struct page *page, struct iomap *iomap);
 };
 
-/*
- * Flags for iomap_begin / iomap_end.  No flag implies a read.
- */
-#define IOMAP_WRITE		(1 << 0) /* writing, must allocate blocks */
-#define IOMAP_ZERO		(1 << 1) /* zeroing operation, may skip holes */
-#define IOMAP_REPORT		(1 << 2) /* report extent status, e.g. FIEMAP */
-#define IOMAP_FAULT		(1 << 3) /* mapping for page fault */
-#define IOMAP_DIRECT		(1 << 4) /* direct I/O */
-#define IOMAP_NOWAIT		(1 << 5) /* do not block */
-
 struct iomap_ops {
 	/*
 	 * Return the existing mapping at pos, or reserve space starting at