diff mbox series

[4/5] ext4: introduce direct IO write code path using iomap infrastructure

Message ID 581c3a2da89991e7ce5862d93dcfb23e1dc8ddc8.1565609891.git.mbobrowski@mbobrowski.org (mailing list archive)
State New, archived
Headers show
Series ext4: direct IO via iomap infrastructure | expand

Commit Message

Matthew Bobrowski Aug. 12, 2019, 12:53 p.m. UTC
This patch introduces a new direct IO write code path implementation
that makes use of the iomap infrastructure.

All direct IO write operations are now passed from the ->write_iter() callback
to the new function ext4_dio_write_iter(). This function is responsible for
calling into iomap infrastructure via iomap_dio_rw(). Snippets of the direct
IO code from within ext4_file_write_iter(), such as checking whether the IO
request is unaligned asynchronous IO, or whether it will ber overwriting
allocated and initialized blocks has been moved out and into
ext4_dio_write_iter().

The block mapping flags that are passed to ext4_map_blocks() from within
ext4_dio_get_block() and friends have effectively been taken out and
introduced within the ext4_iomap_begin(). If ext4_map_blocks() happens to have
instantiated blocks beyond the i_size, then we attempt to place the inode onto
the orphan list. Despite being able to perform i_size extension checking
earlier on in the direct IO code path, it makes most sense to perform this bit
post successful block allocation.

The ->end_io() callback ext4_dio_write_end_io() is responsible for removing
the inode from the orphan list and determining if we should truncate a failed
write in the case of an error. We also convert a range of unwritten extents to
written if IOMAP_DIO_UNWRITTEN is set and perform the necessary
i_size/i_disksize extension if the iocb->ki_pos + dio->size > i_size_read(inode).

In the instance of a short write, we fallback to buffered IO and complete
whatever is left the 'iter'. Any blocks that may have been allocated in
preparation for direct IO will be reused by buffered IO, so there's no issue
with leaving allocated blocks beyond EOF.

Signed-off-by: Matthew Bobrowski <mbobrowski@mbobrowski.org>
---
 fs/ext4/file.c  | 227 ++++++++++++++++++++++++++++++++++++++++----------------
 fs/ext4/inode.c |  42 +++++++++--
 2 files changed, 199 insertions(+), 70 deletions(-)

Comments

Ritesh Harjani Aug. 12, 2019, 5:04 p.m. UTC | #1
On 8/12/19 6:23 PM, Matthew Bobrowski wrote:
> This patch introduces a new direct IO write code path implementation
> that makes use of the iomap infrastructure.
>
> All direct IO write operations are now passed from the ->write_iter() callback
> to the new function ext4_dio_write_iter(). This function is responsible for
> calling into iomap infrastructure via iomap_dio_rw(). Snippets of the direct
> IO code from within ext4_file_write_iter(), such as checking whether the IO
> request is unaligned asynchronous IO, or whether it will ber overwriting
> allocated and initialized blocks has been moved out and into
> ext4_dio_write_iter().
>
> The block mapping flags that are passed to ext4_map_blocks() from within
> ext4_dio_get_block() and friends have effectively been taken out and
> introduced within the ext4_iomap_begin(). If ext4_map_blocks() happens to have
> instantiated blocks beyond the i_size, then we attempt to place the inode onto
> the orphan list. Despite being able to perform i_size extension checking
> earlier on in the direct IO code path, it makes most sense to perform this bit
> post successful block allocation.
>
> The ->end_io() callback ext4_dio_write_end_io() is responsible for removing
> the inode from the orphan list and determining if we should truncate a failed
> write in the case of an error. We also convert a range of unwritten extents to
> written if IOMAP_DIO_UNWRITTEN is set and perform the necessary
> i_size/i_disksize extension if the iocb->ki_pos + dio->size > i_size_read(inode).
>
> In the instance of a short write, we fallback to buffered IO and complete
> whatever is left the 'iter'. Any blocks that may have been allocated in
> preparation for direct IO will be reused by buffered IO, so there's no issue
> with leaving allocated blocks beyond EOF.
>
> Signed-off-by: Matthew Bobrowski <mbobrowski@mbobrowski.org>
> ---
>   fs/ext4/file.c  | 227 ++++++++++++++++++++++++++++++++++++++++----------------
>   fs/ext4/inode.c |  42 +++++++++--
>   2 files changed, 199 insertions(+), 70 deletions(-)
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 7470800c63b7..d74576821676 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -29,6 +29,7 @@
>   #include <linux/pagevec.h>
>   #include <linux/uio.h>
>   #include <linux/mman.h>
> +#include <linux/backing-dev.h>
>   #include "ext4.h"
>   #include "ext4_jbd2.h"
>   #include "xattr.h"
> @@ -218,6 +219,14 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
>   	if (ret <= 0)
>   		return ret;
>   
> +	ret = file_remove_privs(iocb->ki_filp);
> +	if (ret)
> +		return 0;
> +
> +	ret = file_update_time(iocb->ki_filp);
> +	if (ret)
> +		return 0;
> +
>   	if (unlikely(IS_IMMUTABLE(inode)))
>   		return -EPERM;
>   
> @@ -235,6 +244,34 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
>   	return iov_iter_count(from);
>   }
>   
> +static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
> +					struct iov_iter *from)
> +{
> +	ssize_t ret;
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +
> +	if (!inode_trylock(inode)) {
> +		if (iocb->ki_flags & IOCB_NOWAIT)
> +			return -EOPNOTSUPP;
> +		inode_lock(inode);
> +	}
> +
> +	ret = ext4_write_checks(iocb, from);
> +	if (ret <= 0)
> +		goto out;
> +
> +	current->backing_dev_info = inode_to_bdi(inode);
> +	ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
> +	current->backing_dev_info = NULL;
> +out:
> +	inode_unlock(inode);
> +	if (likely(ret > 0)) {
> +		iocb->ki_pos += ret;
> +		ret = generic_write_sync(iocb, ret);
> +	}
> +	return ret;
> +}
> +
>   static int ext4_handle_inode_extension(struct inode *inode, loff_t size,
>   				       size_t count)
>   {
> @@ -284,6 +321,128 @@ static int ext4_handle_inode_extension(struct inode *inode, loff_t size,
>   	return ret;
>   }
>   
> +static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
> +				 ssize_t error, unsigned int flags)
> +{
> +	int ret = 0;
> +	handle_t *handle;
> +	loff_t offset = iocb->ki_pos;
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +
> +	if (error) {
> +		if (offset + size > i_size_read(inode))
> +			ext4_truncate_failed_write(inode);
> +
> +		/*
> +		 * The inode may have been placed onto the orphan list
> +		 * as a result of an extension. However, an error may
> +		 * have been encountered prior to being able to
> +		 * complete the write operation. Perform any necessary
> +		 * clean up in this case.
> +		 */
> +		if (!list_empty(&EXT4_I(inode)->i_orphan)) {
> +			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> +			if (IS_ERR(handle)) {
> +				if (inode->i_nlink)
> +					ext4_orphan_del(NULL, inode);
> +				return PTR_ERR(handle);
> +			}
> +
> +			if (inode->i_nlink)
> +				ext4_orphan_del(handle, inode);
> +			ext4_journal_stop(handle);
> +		}
> +		return error;
> +	}
> +
> +	if (flags & IOMAP_DIO_UNWRITTEN) {
> +		ret = ext4_convert_unwritten_extents(NULL, inode, offset, size);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	if (offset + size > i_size_read(inode)) {
> +		ret = ext4_handle_inode_extension(inode, offset + size, 0);
> +		if (ret)
> +			return ret;
> +	}
> +	return ret;
> +}
> +
> +static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	ssize_t ret;
> +	loff_t offset = iocb->ki_pos;
> +	size_t count = iov_iter_count(from);
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	bool extend = false, overwrite = false, unaligned_aio = false;
> +
> +	if (!inode_trylock(inode)) {
> +		if (iocb->ki_flags & IOCB_NOWAIT)
> +			return -EAGAIN;
> +		inode_lock(inode);
> +	}
> +
> +	if (!ext4_dio_checks(inode)) {
> +		inode_unlock(inode);
> +		/*
> +		 * Fallback to buffered IO if the operation on the
> +		 * inode is not supported by direct IO.
> +		 */
> +		return ext4_buffered_write_iter(iocb, from);
> +	}
> +
> +	ret = ext4_write_checks(iocb, from);
> +	if (ret <= 0)
> +		goto out;
> +
> +	/*
> +	 * Unaligned direct AIO must be serialized among each other as
> +	 * the zeroing of partial blocks of two competing unaligned
> +	 * AIOs can result in data corruption.
> +	 */
> +	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
> +	    !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
> +		unaligned_aio = true;
> +		inode_dio_wait(inode);
> +	}
> +
> +	/*
> +	 * Determine whether the IO operation will overwrite allocated
> +	 * and initialized blocks. If so, check to see whether it is
> +	 * possible to take the dioread_nolock path.
> +	 */
> +	if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
> +	    ext4_should_dioread_nolock(inode)) {
> +		overwrite = true;
> +		downgrade_write(&inode->i_rwsem);
> +	}
> +
> +	if (offset + count > i_size_read(inode) ||
> +	    offset + count > EXT4_I(inode)->i_disksize) {
> +		ext4_update_i_disksize(inode, inode->i_size);
> +		extend = true;
> +	}
> +
> +	ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, ext4_dio_write_end_io);
> +
> +	/*
> +	 * Unaligned direct AIO must be the only IO in flight or else
> +	 * any overlapping aligned IO after unaligned IO might result
> +	 * in data corruption.
> +	 */
> +	if (ret == -EIOCBQUEUED && (unaligned_aio || extend))
> +		inode_dio_wait(inode);

Could you please add explain & add a comment about why we wait in AIO 
DIO case
when extend is true? As I see without iomap code this case was not 
present earlier.


> +
> +	if (ret >= 0 && iov_iter_count(from)) {
> +		overwrite ? inode_unlock_shared(inode) : inode_unlock(inode);
> +		return ext4_buffered_write_iter(iocb, from);
> +	}
should not we copy code from "__generic_file_write_iter" which does below?

3436                 /*
3437                  * We need to ensure that the page cache pages are 
written to
3438                  * disk and invalidated to preserve the expected 
O_DIRECT
3439                  * semantics.
3440                  */


> +out:
> +	overwrite ? inode_unlock_shared(inode) : inode_unlock(inode);
> +	return ret;
> +}
> +
>   #ifdef CONFIG_FS_DAX
>   static ssize_t
>   ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
> @@ -300,12 +459,6 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
>   	ret = ext4_write_checks(iocb, from);
>   	if (ret <= 0)
>   		goto out;
> -	ret = file_remove_privs(iocb->ki_filp);
> -	if (ret)
> -		goto out;
> -	ret = file_update_time(iocb->ki_filp);
> -	if (ret)
> -		goto out;
>   
>   	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
>   
> @@ -327,10 +480,6 @@ static ssize_t
>   ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>   {
>   	struct inode *inode = file_inode(iocb->ki_filp);
> -	int o_direct = iocb->ki_flags & IOCB_DIRECT;
> -	int unaligned_aio = 0;
> -	int overwrite = 0;
> -	ssize_t ret;
>   
>   	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
>   		return -EIO;
> @@ -339,61 +488,9 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>   	if (IS_DAX(inode))
>   		return ext4_dax_write_iter(iocb, from);
>   #endif
> -	if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
> -		return -EOPNOTSUPP;
> -
> -	if (!inode_trylock(inode)) {
> -		if (iocb->ki_flags & IOCB_NOWAIT)
> -			return -EAGAIN;
> -		inode_lock(inode);
> -	}
> -
> -	ret = ext4_write_checks(iocb, from);
> -	if (ret <= 0)
> -		goto out;
> -
> -	/*
> -	 * Unaligned direct AIO must be serialized among each other as zeroing
> -	 * of partial blocks of two competing unaligned AIOs can result in data
> -	 * corruption.
> -	 */
> -	if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
> -	    !is_sync_kiocb(iocb) &&
> -	    ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
> -		unaligned_aio = 1;
> -		ext4_unwritten_wait(inode);
> -	}
> -
> -	iocb->private = &overwrite;
> -	/* Check whether we do a DIO overwrite or not */
> -	if (o_direct && !unaligned_aio) {
> -		if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
> -			if (ext4_should_dioread_nolock(inode))
> -				overwrite = 1;
> -		} else if (iocb->ki_flags & IOCB_NOWAIT) {
> -			ret = -EAGAIN;
> -			goto out;
> -		}
> -	}
> -
> -	ret = __generic_file_write_iter(iocb, from);
> -	/*
> -	 * Unaligned direct AIO must be the only IO in flight. Otherwise
> -	 * overlapping aligned IO after unaligned might result in data
> -	 * corruption.
> -	 */
> -	if (ret == -EIOCBQUEUED && unaligned_aio)
> -		ext4_unwritten_wait(inode);
> -	inode_unlock(inode);
> -
> -	if (ret > 0)
> -		ret = generic_write_sync(iocb, ret);
> -
> -	return ret;
> -
> -out:
> -	inode_unlock(inode);
> -	return ret;
> +	if (iocb->ki_flags & IOCB_DIRECT)
> +		return ext4_dio_write_iter(iocb, from);
> +	return ext4_buffered_write_iter(iocb, from);
>   }
>   
>   #ifdef CONFIG_FS_DAX
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 761ce6286b05..9155a8a6eb0b 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3533,8 +3533,38 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>   		if (IS_ERR(handle))
>   			return PTR_ERR(handle);
>   
> -		ret = ext4_map_blocks(handle, inode, &map,
> -				      EXT4_GET_BLOCKS_CREATE_ZERO);
> +		if (IS_DAX(inode)) {
> +			ret = ext4_map_blocks(handle, inode, &map,
> +					      EXT4_GET_BLOCKS_CREATE_ZERO);
> +		} else {
> +			/*
> +			 * DAX and direct IO are the only two
> +			 * operations currently supported with
> +			 * IOMAP_WRITE.
> +			 */
> +			WARN_ON(!(flags & IOMAP_DIRECT));
> +			if (round_down(offset, i_blocksize(inode)) >=
> +			    i_size_read(inode)) {
> +				ret = ext4_map_blocks(handle, inode, &map,
> +						      EXT4_GET_BLOCKS_CREATE);
> +			} else if (!ext4_test_inode_flag(inode,
> +							 EXT4_INODE_EXTENTS)) {
> +				/*
> +				 * We cannot fill holes in indirect
> +				 * tree based inodes as that could
> +				 * expose stale data in the case of a
> +				 * crash. Use magic error code to
> +				 * fallback to buffered IO.
> +				 */
> +				ret = ext4_map_blocks(handle, inode, &map, 0);
> +				if (ret == 0)
> +					ret = -ENOTBLK;
> +			} else {
> +				ret = ext4_map_blocks(handle, inode, &map,
> +						      EXT4_GET_BLOCKS_IO_CREATE_EXT);
> +			}
> +		}

Could you please check & confirm on below points -
1. Do you see a problem @above in case of *overwrite* with extents mapping?
It will fall into EXT4_GET_BLOCKS_IO_CREATE_EXT case.
So are we piggy backing on the fact that ext4_map_blocks first call 
ext4_ext_map_blocks
with flags & EXT4_GET_BLOCKS_KEEP_SIZE. And so for overwrite case since 
it will return
val > 0 then we will anyway not create any blocks and so we don't need 
to check overwrite
case specifically here?


2. For cases with flags passed is 0 to ext4_map_blocks (overwrite & 
fallocate without extent case),
we need not start the journaling transaction. But in above we are doing 
ext4_journal_start/stop unconditionally
& unnecessarily reserving dio_credits blocks.
We need to take care of that right?


> +
>   		if (ret < 0) {
>   			ext4_journal_stop(handle);
>   			if (ret == -ENOSPC &&
> @@ -3581,10 +3611,10 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>   		iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
>   		iomap->addr = IOMAP_NULL_ADDR;
>   	} else {
> -		if (map.m_flags & EXT4_MAP_MAPPED) {
> -			iomap->type = IOMAP_MAPPED;
> -		} else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
> +		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
>   			iomap->type = IOMAP_UNWRITTEN;
> +		} else if (map.m_flags & EXT4_MAP_MAPPED) {
> +			iomap->type = IOMAP_MAPPED;
Maybe a comment as to explaining why checking UNWRITTEN before is 
necessary for others.
So in case of fallocate & DIO write case we may get extent which is both 
unwritten & mapped (right?).
so we need to check if we have an unwritten extent first so that it will 
need the conversion in ->end_io
callback.

>   		} else {
>   			WARN_ON_ONCE(1);
>   			return -EIO;
> @@ -3601,6 +3631,8 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>   static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
>   			  ssize_t written, unsigned flags, struct iomap *iomap)
>   {
> +	if (flags & IOMAP_DIRECT && written == 0)
> +		return -ENOTBLK;
>   	return 0;
>   }
>
Christoph Hellwig Aug. 12, 2019, 5:34 p.m. UTC | #2
> +	if (error) {
> +		if (offset + size > i_size_read(inode))
> +			ext4_truncate_failed_write(inode);
> +
> +		/*
> +		 * The inode may have been placed onto the orphan list
> +		 * as a result of an extension. However, an error may
> +		 * have been encountered prior to being able to
> +		 * complete the write operation. Perform any necessary
> +		 * clean up in this case.
> +		 */
> +		if (!list_empty(&EXT4_I(inode)->i_orphan)) {
> +			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> +			if (IS_ERR(handle)) {
> +				if (inode->i_nlink)
> +					ext4_orphan_del(NULL, inode);
> +				return PTR_ERR(handle);
> +			}
> +
> +			if (inode->i_nlink)
> +				ext4_orphan_del(handle, inode);
> +			ext4_journal_stop(handle);
> +		}
> +		return error;

I'd split this branch into a separate function just to keep the
end_io handler tidy.

> +	if (ret == -EIOCBQUEUED && (unaligned_aio || extend))
> +		inode_dio_wait(inode);
> +
> +	if (ret >= 0 && iov_iter_count(from)) {
> +		overwrite ? inode_unlock_shared(inode) : inode_unlock(inode);
> +		return ext4_buffered_write_iter(iocb, from);
> +	}
> +out:
> +	overwrite ? inode_unlock_shared(inode) : inode_unlock(inode);
> +	return ret;

the ? : expression here is weird.

I'd write this as:

	if (overwrite)
		inode_unlock_shared(inode);
	else
		inode_unlock(inode);

	if (ret >= 0 && iov_iter_count(from))
		return ext4_buffered_write_iter(iocb, from);
	return ret;

and handle the only place we jump to the current out label manually,
as that always does an exclusive unlock anyway.

> +		if (IS_DAX(inode)) {
> +			ret = ext4_map_blocks(handle, inode, &map,
> +					      EXT4_GET_BLOCKS_CREATE_ZERO);
> +		} else {
> +			/*
> +			 * DAX and direct IO are the only two
> +			 * operations currently supported with
> +			 * IOMAP_WRITE.
> +			 */
> +			WARN_ON(!(flags & IOMAP_DIRECT));
> +			if (round_down(offset, i_blocksize(inode)) >=
> +			    i_size_read(inode)) {
> +				ret = ext4_map_blocks(handle, inode, &map,
> +						      EXT4_GET_BLOCKS_CREATE);
> +			} else if (!ext4_test_inode_flag(inode,
> +							 EXT4_INODE_EXTENTS)) {
> +				/*
> +				 * We cannot fill holes in indirect
> +				 * tree based inodes as that could
> +				 * expose stale data in the case of a
> +				 * crash. Use magic error code to
> +				 * fallback to buffered IO.
> +				 */
> +				ret = ext4_map_blocks(handle, inode, &map, 0);
> +				if (ret == 0)
> +					ret = -ENOTBLK;
> +			} else {
> +				ret = ext4_map_blocks(handle, inode, &map,
> +						      EXT4_GET_BLOCKS_IO_CREATE_EXT);
> +			}
> +		}

I think this could be simplified down to something like:

		int flags = 0;

		...

		/*
		 * DAX and direct IO are the only two operations currently
		 * supported with IOMAP_WRITE.
		 */
		WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));

		if (IS_DAX(inode))
			flags = EXT4_GET_BLOCKS_CREATE_ZERO;
		else if (round_down(offset, i_blocksize(inode)) >=
				i_size_read(inode)) {
			flags = EXT4_GET_BLOCKS_CREATE;
		else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
			flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;

		/*
		 * We cannot fill holes in indirect tree based inodes as that
		 * could expose stale data in the case of a crash.  Use the
		 * magic error code to fallback to buffered IO.
		 */
		if (!flags && !ret)
			ret = -ENOTBLK;


> @@ -3601,6 +3631,8 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>  static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
>  			  ssize_t written, unsigned flags, struct iomap *iomap)
>  {
> +	if (flags & IOMAP_DIRECT && written == 0)
> +		return -ENOTBLK;

This probably wants a comment, too.  But do we actually ever end up
here?
Matthew Bobrowski Aug. 13, 2019, 10:45 a.m. UTC | #3
On Mon, Aug 12, 2019 at 10:34:03AM -0700, Christoph Hellwig wrote:
> > +	if (error) {
> > +		if (offset + size > i_size_read(inode))
> > +			ext4_truncate_failed_write(inode);
> > +
> > +		/*
> > +		 * The inode may have been placed onto the orphan list
> > +		 * as a result of an extension. However, an error may
> > +		 * have been encountered prior to being able to
> > +		 * complete the write operation. Perform any necessary
> > +		 * clean up in this case.
> > +		 */
> > +		if (!list_empty(&EXT4_I(inode)->i_orphan)) {
> > +			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> > +			if (IS_ERR(handle)) {
> > +				if (inode->i_nlink)
> > +					ext4_orphan_del(NULL, inode);
> > +				return PTR_ERR(handle);
> > +			}
> > +
> > +			if (inode->i_nlink)
> > +				ext4_orphan_del(handle, inode);
> > +			ext4_journal_stop(handle);
> > +		}
> > +		return error;
> 
> I'd split this branch into a separate function just to keep the
> end_io handler tidy.

Good idea. I'll do that...

> > +	if (ret == -EIOCBQUEUED && (unaligned_aio || extend))
> > +		inode_dio_wait(inode);
> > +
> > +	if (ret >= 0 && iov_iter_count(from)) {
> > +		overwrite ? inode_unlock_shared(inode) : inode_unlock(inode);
> > +		return ext4_buffered_write_iter(iocb, from);
> > +	}
> > +out:
> > +	overwrite ? inode_unlock_shared(inode) : inode_unlock(inode);
> > +	return ret;
> 
> the ? : expression here is weird.
> 
> I'd write this as:
> 
> 	if (overwrite)
> 		inode_unlock_shared(inode);
> 	else
> 		inode_unlock(inode);
> 
> 	if (ret >= 0 && iov_iter_count(from))
> 		return ext4_buffered_write_iter(iocb, from);
> 	return ret;
> 
> and handle the only place we jump to the current out label manually,
> as that always does an exclusive unlock anyway.

Yeah, the ternary operators do look weird here and I'd prefer if we
also dropped them. I was at a point where I was trying to clean up
some of the code, but I had been staring at the screen for so long my
brain went numb and couldn't think of how to do this neatly. I'm happy
with this suggestion. :-)

> > +		if (IS_DAX(inode)) {
> > +			ret = ext4_map_blocks(handle, inode, &map,
> > +					      EXT4_GET_BLOCKS_CREATE_ZERO);
> > +		} else {
> > +			/*
> > +			 * DAX and direct IO are the only two
> > +			 * operations currently supported with
> > +			 * IOMAP_WRITE.
> > +			 */
> > +			WARN_ON(!(flags & IOMAP_DIRECT));
> > +			if (round_down(offset, i_blocksize(inode)) >=
> > +			    i_size_read(inode)) {
> > +				ret = ext4_map_blocks(handle, inode, &map,
> > +						      EXT4_GET_BLOCKS_CREATE);
> > +			} else if (!ext4_test_inode_flag(inode,
> > +							 EXT4_INODE_EXTENTS)) {
> > +				/*
> > +				 * We cannot fill holes in indirect
> > +				 * tree based inodes as that could
> > +				 * expose stale data in the case of a
> > +				 * crash. Use magic error code to
> > +				 * fallback to buffered IO.
> > +				 */
> > +				ret = ext4_map_blocks(handle, inode, &map, 0);
> > +				if (ret == 0)
> > +					ret = -ENOTBLK;
> > +			} else {
> > +				ret = ext4_map_blocks(handle, inode, &map,
> > +						      EXT4_GET_BLOCKS_IO_CREATE_EXT);
> > +			}
> > +		}
> 
> I think this could be simplified down to something like:
> 
> 		int flags = 0;
> 
> 		...
> 
> 		/*
> 		 * DAX and direct IO are the only two operations currently
> 		 * supported with IOMAP_WRITE.
> 		 */
> 		WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
> 
> 		if (IS_DAX(inode))
> 			flags = EXT4_GET_BLOCKS_CREATE_ZERO;
> 		else if (round_down(offset, i_blocksize(inode)) >=
> 				i_size_read(inode)) {
> 			flags = EXT4_GET_BLOCKS_CREATE;
> 		else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> 			flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
> 
> 		/*
> 		 * We cannot fill holes in indirect tree based inodes as that
> 		 * could expose stale data in the case of a crash.  Use the
> 		 * magic error code to fallback to buffered IO.
> 		 */
> 		if (!flags && !ret)
> 			ret = -ENOTBLK;

This also seems OK to me.

> > @@ -3601,6 +3631,8 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> >  static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
> >  			  ssize_t written, unsigned flags, struct iomap *iomap)
> >  {
> > +	if (flags & IOMAP_DIRECT && written == 0)
> > +		return -ENOTBLK;
> 
> This probably wants a comment, too.  But do we actually ever end up
> here?

Sure, I can append a comment. Also, I don't believe that we can
completely drop the ->iomap_end() callback as hinted in one of your
other comments. The reason I say this is because we still need this to
catch the case where an error an occurs within 'iomap_actor_t'. If
that happens to be, within iomap_dio_rw() we wait for IO completion
before returning and then we fallback to buffered IO to complete the
remainder of the IO. We will also be able to reuse the extent that was
allocated when preparing for direct IO if we do this.

--M
Matthew Bobrowski Aug. 13, 2019, 12:58 p.m. UTC | #4
On Mon, Aug 12, 2019 at 10:34:29PM +0530, RITESH HARJANI wrote:
> > +	if (offset + count > i_size_read(inode) ||
> > +	    offset + count > EXT4_I(inode)->i_disksize) {
> > +		ext4_update_i_disksize(inode, inode->i_size);
> > +		extend = true;
> > +	}
> > +
> > +	ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, ext4_dio_write_end_io);
> > +
> > +	/*
> > +	 * Unaligned direct AIO must be the only IO in flight or else
> > +	 * any overlapping aligned IO after unaligned IO might result
> > +	 * in data corruption.
> > +	 */
> > +	if (ret == -EIOCBQUEUED && (unaligned_aio || extend))
> > +		inode_dio_wait(inode);
> 
> Could you please add explain & add a comment about why we wait in AIO DIO
> case
> when extend is true? As I see without iomap code this case was not present
> earlier.

Because while using the iomap infrastructure for AIO writes, we return to the
caller prior to invoking the ->end_io() handler. This callback is responsible
for performing the in-core/on-disk inode extension if it is deemed
necessary. If we don't wait in the case of an extend, we run the risk of
loosing inode size consistencies in addition to things leading to possible
data corruption...

> > +
> > +	if (ret >= 0 && iov_iter_count(from)) {
> > +		overwrite ? inode_unlock_shared(inode) : inode_unlock(inode);
> > +		return ext4_buffered_write_iter(iocb, from);
> > +	}
> should not we copy code from "__generic_file_write_iter" which does below?
> 
> 3436                 /*
> 3437                  * We need to ensure that the page cache pages are
> written to
> 3438                  * disk and invalidated to preserve the expected
> O_DIRECT
> 3439                  * semantics.
> 3440                  */

Hm, I don't see why this would be required seeing as though the page cache
invalidation semantics pre and post write are handled by iomap_dio_rw() and
iomap_dio_complete(). But, I could be completely wrong here, so we may need to
wait for some others to provide comments on this.

> > +			WARN_ON(!(flags & IOMAP_DIRECT));
> > +			if (round_down(offset, i_blocksize(inode)) >=
> > +			    i_size_read(inode)) {
> > +				ret = ext4_map_blocks(handle, inode, &map,
> > +						      EXT4_GET_BLOCKS_CREATE);
> > +			} else if (!ext4_test_inode_flag(inode,
> > +							 EXT4_INODE_EXTENTS)) {
> > +				/*
> > +				 * We cannot fill holes in indirect
> > +				 * tree based inodes as that could
> > +				 * expose stale data in the case of a
> > +				 * crash. Use magic error code to
> > +				 * fallback to buffered IO.
> > +				 */
> > +				ret = ext4_map_blocks(handle, inode, &map, 0);
> > +				if (ret == 0)
> > +					ret = -ENOTBLK;
> > +			} else {
> > +				ret = ext4_map_blocks(handle, inode, &map,
> > +						      EXT4_GET_BLOCKS_IO_CREATE_EXT);
> > +			}
> > +		}
> 
> Could you please check & confirm on below points -
> 1. Do you see a problem @above in case of *overwrite* with extents mapping?
> It will fall into EXT4_GET_BLOCKS_IO_CREATE_EXT case.
> So are we piggy backing on the fact that ext4_map_blocks first call
> ext4_ext_map_blocks
> with flags & EXT4_GET_BLOCKS_KEEP_SIZE. And so for overwrite case since it
> will return
> val > 0 then we will anyway not create any blocks and so we don't need to
> check overwrite
> case specifically here?
> 
> 
> 2. For cases with flags passed is 0 to ext4_map_blocks (overwrite &
> fallocate without extent case),
> we need not start the journaling transaction. But in above we are doing
> ext4_journal_start/stop unconditionally
> & unnecessarily reserving dio_credits blocks.
> We need to take care of that right?

Hm, I think you raise valid points here.

Jan, do you have any comments on the above? I vaguely remember having a
discussion around dropping the overwrite checks in ext4_iomap_begin() as we're
removing the inode_lock() early on in ext4_dio_write_iter(), so it woudln't be
necessary to do so. But, now that Ritesh mentioned it again I'm thinking it
may actually be required...

> >   		if (ret < 0) {
> >   			ext4_journal_stop(handle);
> >   			if (ret == -ENOSPC &&
> > @@ -3581,10 +3611,10 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> >   		iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
> >   		iomap->addr = IOMAP_NULL_ADDR;
> >   	} else {
> > -		if (map.m_flags & EXT4_MAP_MAPPED) {
> > -			iomap->type = IOMAP_MAPPED;
> > -		} else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
> > +		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
> >   			iomap->type = IOMAP_UNWRITTEN;
> > +		} else if (map.m_flags & EXT4_MAP_MAPPED) {
> > +			iomap->type = IOMAP_MAPPED;
> Maybe a comment as to explaining why checking UNWRITTEN before is necessary
> for others.
> So in case of fallocate & DIO write case we may get extent which is both
> unwritten & mapped (right?).
> so we need to check if we have an unwritten extent first so that it will
> need the conversion in ->end_io
> callback.

Yes, that is essentially correct.

--M
Darrick J. Wong Aug. 13, 2019, 2:35 p.m. UTC | #5
On Tue, Aug 13, 2019 at 10:58:42PM +1000, Matthew Bobrowski wrote:
> On Mon, Aug 12, 2019 at 10:34:29PM +0530, RITESH HARJANI wrote:
> > > +	if (offset + count > i_size_read(inode) ||
> > > +	    offset + count > EXT4_I(inode)->i_disksize) {
> > > +		ext4_update_i_disksize(inode, inode->i_size);
> > > +		extend = true;
> > > +	}
> > > +
> > > +	ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, ext4_dio_write_end_io);
> > > +
> > > +	/*
> > > +	 * Unaligned direct AIO must be the only IO in flight or else
> > > +	 * any overlapping aligned IO after unaligned IO might result
> > > +	 * in data corruption.
> > > +	 */
> > > +	if (ret == -EIOCBQUEUED && (unaligned_aio || extend))
> > > +		inode_dio_wait(inode);
> > 
> > Could you please add explain & add a comment about why we wait in AIO DIO
> > case
> > when extend is true? As I see without iomap code this case was not present
> > earlier.
> 
> Because while using the iomap infrastructure for AIO writes, we return to the
> caller prior to invoking the ->end_io() handler. This callback is responsible
> for performing the in-core/on-disk inode extension if it is deemed
> necessary. If we don't wait in the case of an extend, we run the risk of
> loosing inode size consistencies in addition to things leading to possible
> data corruption...
> 
> > > +
> > > +	if (ret >= 0 && iov_iter_count(from)) {
> > > +		overwrite ? inode_unlock_shared(inode) : inode_unlock(inode);
> > > +		return ext4_buffered_write_iter(iocb, from);
> > > +	}
> > should not we copy code from "__generic_file_write_iter" which does below?
> > 
> > 3436                 /*
> > 3437                  * We need to ensure that the page cache pages are
> > written to
> > 3438                  * disk and invalidated to preserve the expected
> > O_DIRECT
> > 3439                  * semantics.
> > 3440                  */
> 
> Hm, I don't see why this would be required seeing as though the page cache
> invalidation semantics pre and post write are handled by iomap_dio_rw() and
> iomap_dio_complete(). But, I could be completely wrong here, so we may need to
> wait for some others to provide comments on this.

iomap_dio_rw is supposed to zap the page cache before the write and
again afterwards (and whine if someone is racing buffered and direct
writes to the same file location), so ext4 shouldn't need to do that
itself.

--D

> > > +			WARN_ON(!(flags & IOMAP_DIRECT));
> > > +			if (round_down(offset, i_blocksize(inode)) >=
> > > +			    i_size_read(inode)) {
> > > +				ret = ext4_map_blocks(handle, inode, &map,
> > > +						      EXT4_GET_BLOCKS_CREATE);
> > > +			} else if (!ext4_test_inode_flag(inode,
> > > +							 EXT4_INODE_EXTENTS)) {
> > > +				/*
> > > +				 * We cannot fill holes in indirect
> > > +				 * tree based inodes as that could
> > > +				 * expose stale data in the case of a
> > > +				 * crash. Use magic error code to
> > > +				 * fallback to buffered IO.
> > > +				 */
> > > +				ret = ext4_map_blocks(handle, inode, &map, 0);
> > > +				if (ret == 0)
> > > +					ret = -ENOTBLK;
> > > +			} else {
> > > +				ret = ext4_map_blocks(handle, inode, &map,
> > > +						      EXT4_GET_BLOCKS_IO_CREATE_EXT);
> > > +			}
> > > +		}
> > 
> > Could you please check & confirm on below points -
> > 1. Do you see a problem @above in case of *overwrite* with extents mapping?
> > It will fall into EXT4_GET_BLOCKS_IO_CREATE_EXT case.
> > So are we piggy backing on the fact that ext4_map_blocks first call
> > ext4_ext_map_blocks
> > with flags & EXT4_GET_BLOCKS_KEEP_SIZE. And so for overwrite case since it
> > will return
> > val > 0 then we will anyway not create any blocks and so we don't need to
> > check overwrite
> > case specifically here?
> > 
> > 
> > 2. For cases with flags passed is 0 to ext4_map_blocks (overwrite &
> > fallocate without extent case),
> > we need not start the journaling transaction. But in above we are doing
> > ext4_journal_start/stop unconditionally
> > & unnecessarily reserving dio_credits blocks.
> > We need to take care of that right?
> 
> Hm, I think you raise valid points here.
> 
> Jan, do you have any comments on the above? I vaguely remember having a
> discussion around dropping the overwrite checks in ext4_iomap_begin() as we're
> removing the inode_lock() early on in ext4_dio_write_iter(), so it woudln't be
> necessary to do so. But, now that Ritesh mentioned it again I'm thinking it
> may actually be required...
> 
> > >   		if (ret < 0) {
> > >   			ext4_journal_stop(handle);
> > >   			if (ret == -ENOSPC &&
> > > @@ -3581,10 +3611,10 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> > >   		iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
> > >   		iomap->addr = IOMAP_NULL_ADDR;
> > >   	} else {
> > > -		if (map.m_flags & EXT4_MAP_MAPPED) {
> > > -			iomap->type = IOMAP_MAPPED;
> > > -		} else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
> > > +		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
> > >   			iomap->type = IOMAP_UNWRITTEN;
> > > +		} else if (map.m_flags & EXT4_MAP_MAPPED) {
> > > +			iomap->type = IOMAP_MAPPED;
> > Maybe a comment as to explaining why checking UNWRITTEN before is necessary
> > for others.
> > So in case of fallocate & DIO write case we may get extent which is both
> > unwritten & mapped (right?).
> > so we need to check if we have an unwritten extent first so that it will
> > need the conversion in ->end_io
> > callback.
> 
> Yes, that is essentially correct.
> 
> --M
Matthew Bobrowski Aug. 14, 2019, 9:51 a.m. UTC | #6
On Tue, Aug 13, 2019 at 07:35:40AM -0700, Darrick J. Wong wrote:
> On Tue, Aug 13, 2019 at 10:58:42PM +1000, Matthew Bobrowski wrote:
> > On Mon, Aug 12, 2019 at 10:34:29PM +0530, RITESH HARJANI wrote:
> > > > +
> > > > +	if (ret >= 0 && iov_iter_count(from)) {
> > > > +		overwrite ? inode_unlock_shared(inode) : inode_unlock(inode);
> > > > +		return ext4_buffered_write_iter(iocb, from);
> > > > +	}
> > > should not we copy code from "__generic_file_write_iter" which does below?
> > > 
> > > 3436                 /*
> > > 3437                  * We need to ensure that the page cache pages are
> > > written to
> > > 3438                  * disk and invalidated to preserve the expected
> > > O_DIRECT
> > > 3439                  * semantics.
> > > 3440                  */
> > 
> > Hm, I don't see why this would be required seeing as though the page cache
> > invalidation semantics pre and post write are handled by iomap_dio_rw() and
> > iomap_dio_complete(). But, I could be completely wrong here, so we may need to
> > wait for some others to provide comments on this.
> 
> iomap_dio_rw is supposed to zap the page cache before the write and
> again afterwards (and whine if someone is racing buffered and direct
> writes to the same file location), so ext4 shouldn't need to do that
> itself.

Thanks for confirming Darrick! I thought that was the case.

--M
Jan Kara Aug. 28, 2019, 8:26 p.m. UTC | #7
On Mon 12-08-19 22:53:26, Matthew Bobrowski wrote:
> This patch introduces a new direct IO write code path implementation
> that makes use of the iomap infrastructure.
> 
> All direct IO write operations are now passed from the ->write_iter() callback
> to the new function ext4_dio_write_iter(). This function is responsible for
> calling into iomap infrastructure via iomap_dio_rw(). Snippets of the direct
> IO code from within ext4_file_write_iter(), such as checking whether the IO
> request is unaligned asynchronous IO, or whether it will ber overwriting
> allocated and initialized blocks has been moved out and into
> ext4_dio_write_iter().
> 
> The block mapping flags that are passed to ext4_map_blocks() from within
> ext4_dio_get_block() and friends have effectively been taken out and
> introduced within the ext4_iomap_begin(). If ext4_map_blocks() happens to have
> instantiated blocks beyond the i_size, then we attempt to place the inode onto
> the orphan list. Despite being able to perform i_size extension checking
> earlier on in the direct IO code path, it makes most sense to perform this bit
> post successful block allocation.
> 
> The ->end_io() callback ext4_dio_write_end_io() is responsible for removing
> the inode from the orphan list and determining if we should truncate a failed
> write in the case of an error. We also convert a range of unwritten extents to
> written if IOMAP_DIO_UNWRITTEN is set and perform the necessary
> i_size/i_disksize extension if the iocb->ki_pos + dio->size > i_size_read(inode).
> 
> In the instance of a short write, we fallback to buffered IO and complete
> whatever is left the 'iter'. Any blocks that may have been allocated in
> preparation for direct IO will be reused by buffered IO, so there's no issue
> with leaving allocated blocks beyond EOF.
> 
> Signed-off-by: Matthew Bobrowski <mbobrowski@mbobrowski.org>
> ---
>  fs/ext4/file.c  | 227 ++++++++++++++++++++++++++++++++++++++++----------------
>  fs/ext4/inode.c |  42 +++++++++--
>  2 files changed, 199 insertions(+), 70 deletions(-)

Overall this is very nice. Some smaller comments below.

> @@ -235,6 +244,34 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
>  	return iov_iter_count(from);
>  }
>  
> +static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
> +					struct iov_iter *from)
> +{
> +	ssize_t ret;
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +
> +	if (!inode_trylock(inode)) {
> +		if (iocb->ki_flags & IOCB_NOWAIT)
> +			return -EOPNOTSUPP;
> +		inode_lock(inode);
> +	}

Currently there's no support for IOCB_NOWAIT for buffered IO so you can
replace this with "inode_lock(inode)".

> @@ -284,6 +321,128 @@ static int ext4_handle_inode_extension(struct inode *inode, loff_t size,
>  	return ret;
>  }
>  

I'd mention here that for cases where inode size is extended,
ext4_dio_write_iter() waits for DIO to complete and thus we are protected
by inode_lock in that case.

> +static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
> +				 ssize_t error, unsigned int flags)
> +{
> +	int ret = 0;
> +	handle_t *handle;
> +	loff_t offset = iocb->ki_pos;
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +
> +	if (error) {
> +		if (offset + size > i_size_read(inode))
> +			ext4_truncate_failed_write(inode);
> +
> +		/*
> +		 * The inode may have been placed onto the orphan list
> +		 * as a result of an extension. However, an error may
> +		 * have been encountered prior to being able to
> +		 * complete the write operation. Perform any necessary
> +		 * clean up in this case.
> +		 */
> +		if (!list_empty(&EXT4_I(inode)->i_orphan)) {
> +			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> +			if (IS_ERR(handle)) {
> +				if (inode->i_nlink)
> +					ext4_orphan_del(NULL, inode);
> +				return PTR_ERR(handle);
> +			}
> +
> +			if (inode->i_nlink)
> +				ext4_orphan_del(handle, inode);
> +			ext4_journal_stop(handle);
> +		}
> +		return error;
> +	}
> +
> +	if (flags & IOMAP_DIO_UNWRITTEN) {
> +		ret = ext4_convert_unwritten_extents(NULL, inode, offset, size);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	if (offset + size > i_size_read(inode)) {
> +		ret = ext4_handle_inode_extension(inode, offset + size, 0);
> +		if (ret)
> +			return ret;
> +	}
> +	return ret;
> +}
> +
> +static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	ssize_t ret;
> +	loff_t offset = iocb->ki_pos;
> +	size_t count = iov_iter_count(from);
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	bool extend = false, overwrite = false, unaligned_aio = false;
> +
> +	if (!inode_trylock(inode)) {
> +		if (iocb->ki_flags & IOCB_NOWAIT)
> +			return -EAGAIN;
> +		inode_lock(inode);
> +	}
> +
> +	if (!ext4_dio_checks(inode)) {
> +		inode_unlock(inode);
> +		/*
> +		 * Fallback to buffered IO if the operation on the
> +		 * inode is not supported by direct IO.
> +		 */
> +		return ext4_buffered_write_iter(iocb, from);
> +	}
> +
> +	ret = ext4_write_checks(iocb, from);
> +	if (ret <= 0)
> +		goto out;
> +
> +	/*
> +	 * Unaligned direct AIO must be serialized among each other as
> +	 * the zeroing of partial blocks of two competing unaligned
> +	 * AIOs can result in data corruption.
> +	 */
> +	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
> +	    !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
> +		unaligned_aio = true;
> +		inode_dio_wait(inode);
> +	}
> +
> +	/*
> +	 * Determine whether the IO operation will overwrite allocated
> +	 * and initialized blocks. If so, check to see whether it is
> +	 * possible to take the dioread_nolock path.
> +	 */
> +	if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
> +	    ext4_should_dioread_nolock(inode)) {
> +		overwrite = true;
> +		downgrade_write(&inode->i_rwsem);
> +	}
> +
> +	if (offset + count > i_size_read(inode) ||
> +	    offset + count > EXT4_I(inode)->i_disksize) {
> +		ext4_update_i_disksize(inode, inode->i_size);
> +		extend = true;
> +	}
> +
> +	ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, ext4_dio_write_end_io);
> +
> +	/*
> +	 * Unaligned direct AIO must be the only IO in flight or else
> +	 * any overlapping aligned IO after unaligned IO might result
> +	 * in data corruption.
> +	 */

Here I'd expand the comment to explain that we wait in case inode is
extended so that inode extension in ext4_dio_write_end_io() is properly
covered by inode_lock.

> +	if (ret == -EIOCBQUEUED && (unaligned_aio || extend))
> +		inode_dio_wait(inode);
> +
> +	if (ret >= 0 && iov_iter_count(from)) {
> +		overwrite ? inode_unlock_shared(inode) : inode_unlock(inode);
> +		return ext4_buffered_write_iter(iocb, from);
> +	}
> +out:
> +	overwrite ? inode_unlock_shared(inode) : inode_unlock(inode);
> +	return ret;
> +}
> +
>  #ifdef CONFIG_FS_DAX
>  static ssize_t
>  ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)

...

> @@ -3581,10 +3611,10 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>  		iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
>  		iomap->addr = IOMAP_NULL_ADDR;
>  	} else {
> -		if (map.m_flags & EXT4_MAP_MAPPED) {
> -			iomap->type = IOMAP_MAPPED;
> -		} else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
> +		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
>  			iomap->type = IOMAP_UNWRITTEN;
> +		} else if (map.m_flags & EXT4_MAP_MAPPED) {
> +			iomap->type = IOMAP_MAPPED;
>  		} else {
>  			WARN_ON_ONCE(1);
>  			return -EIO;

Possibly this hunk should go into a separate patch (since this is not
directly related with iomap conversion) with a changelog / comment
explaining why we need to check EXT4_MAP_UNWRITTEN first.

								Honza
Dave Chinner Aug. 28, 2019, 10:32 p.m. UTC | #8
On Wed, Aug 28, 2019 at 10:26:19PM +0200, Jan Kara wrote:
> On Mon 12-08-19 22:53:26, Matthew Bobrowski wrote:
> > This patch introduces a new direct IO write code path implementation
> > that makes use of the iomap infrastructure.
> > 
> > All direct IO write operations are now passed from the ->write_iter() callback
> > to the new function ext4_dio_write_iter(). This function is responsible for
> > calling into iomap infrastructure via iomap_dio_rw(). Snippets of the direct
> > IO code from within ext4_file_write_iter(), such as checking whether the IO
> > request is unaligned asynchronous IO, or whether it will ber overwriting
> > allocated and initialized blocks has been moved out and into
> > ext4_dio_write_iter().
> > 
> > The block mapping flags that are passed to ext4_map_blocks() from within
> > ext4_dio_get_block() and friends have effectively been taken out and
> > introduced within the ext4_iomap_begin(). If ext4_map_blocks() happens to have
> > instantiated blocks beyond the i_size, then we attempt to place the inode onto
> > the orphan list. Despite being able to perform i_size extension checking
> > earlier on in the direct IO code path, it makes most sense to perform this bit
> > post successful block allocation.
> > 
> > The ->end_io() callback ext4_dio_write_end_io() is responsible for removing
> > the inode from the orphan list and determining if we should truncate a failed
> > write in the case of an error. We also convert a range of unwritten extents to
> > written if IOMAP_DIO_UNWRITTEN is set and perform the necessary
> > i_size/i_disksize extension if the iocb->ki_pos + dio->size > i_size_read(inode).
> > 
> > In the instance of a short write, we fallback to buffered IO and complete
> > whatever is left the 'iter'. Any blocks that may have been allocated in
> > preparation for direct IO will be reused by buffered IO, so there's no issue
> > with leaving allocated blocks beyond EOF.
> > 
> > Signed-off-by: Matthew Bobrowski <mbobrowski@mbobrowski.org>
> > ---
> >  fs/ext4/file.c  | 227 ++++++++++++++++++++++++++++++++++++++++----------------
> >  fs/ext4/inode.c |  42 +++++++++--
> >  2 files changed, 199 insertions(+), 70 deletions(-)
> 
> Overall this is very nice. Some smaller comments below.
> 
> > @@ -235,6 +244,34 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
> >  	return iov_iter_count(from);
> >  }
> >  
> > +static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
> > +					struct iov_iter *from)
> > +{
> > +	ssize_t ret;
> > +	struct inode *inode = file_inode(iocb->ki_filp);
> > +
> > +	if (!inode_trylock(inode)) {
> > +		if (iocb->ki_flags & IOCB_NOWAIT)
> > +			return -EOPNOTSUPP;
> > +		inode_lock(inode);
> > +	}
> 
> Currently there's no support for IOCB_NOWAIT for buffered IO so you can
> replace this with "inode_lock(inode)".

IOCB_NOWAIT is supported for buffered reads. It is not supported on
buffered writes (as yet), so this should return EOPNOTSUPP if
IOCB_NOWAIT is set, regardless of whether the lock can be grabbed or
not.

Cheers,

Dave.
Jan Kara Aug. 29, 2019, 8:03 a.m. UTC | #9
On Thu 29-08-19 08:32:18, Dave Chinner wrote:
> On Wed, Aug 28, 2019 at 10:26:19PM +0200, Jan Kara wrote:
> > On Mon 12-08-19 22:53:26, Matthew Bobrowski wrote:
> > > This patch introduces a new direct IO write code path implementation
> > > that makes use of the iomap infrastructure.
> > > 
> > > All direct IO write operations are now passed from the ->write_iter() callback
> > > to the new function ext4_dio_write_iter(). This function is responsible for
> > > calling into iomap infrastructure via iomap_dio_rw(). Snippets of the direct
> > > IO code from within ext4_file_write_iter(), such as checking whether the IO
> > > request is unaligned asynchronous IO, or whether it will ber overwriting
> > > allocated and initialized blocks has been moved out and into
> > > ext4_dio_write_iter().
> > > 
> > > The block mapping flags that are passed to ext4_map_blocks() from within
> > > ext4_dio_get_block() and friends have effectively been taken out and
> > > introduced within the ext4_iomap_begin(). If ext4_map_blocks() happens to have
> > > instantiated blocks beyond the i_size, then we attempt to place the inode onto
> > > the orphan list. Despite being able to perform i_size extension checking
> > > earlier on in the direct IO code path, it makes most sense to perform this bit
> > > post successful block allocation.
> > > 
> > > The ->end_io() callback ext4_dio_write_end_io() is responsible for removing
> > > the inode from the orphan list and determining if we should truncate a failed
> > > write in the case of an error. We also convert a range of unwritten extents to
> > > written if IOMAP_DIO_UNWRITTEN is set and perform the necessary
> > > i_size/i_disksize extension if the iocb->ki_pos + dio->size > i_size_read(inode).
> > > 
> > > In the instance of a short write, we fallback to buffered IO and complete
> > > whatever is left the 'iter'. Any blocks that may have been allocated in
> > > preparation for direct IO will be reused by buffered IO, so there's no issue
> > > with leaving allocated blocks beyond EOF.
> > > 
> > > Signed-off-by: Matthew Bobrowski <mbobrowski@mbobrowski.org>
> > > ---
> > >  fs/ext4/file.c  | 227 ++++++++++++++++++++++++++++++++++++++++----------------
> > >  fs/ext4/inode.c |  42 +++++++++--
> > >  2 files changed, 199 insertions(+), 70 deletions(-)
> > 
> > Overall this is very nice. Some smaller comments below.
> > 
> > > @@ -235,6 +244,34 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
> > >  	return iov_iter_count(from);
> > >  }
> > >  
> > > +static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
> > > +					struct iov_iter *from)
> > > +{
> > > +	ssize_t ret;
> > > +	struct inode *inode = file_inode(iocb->ki_filp);
> > > +
> > > +	if (!inode_trylock(inode)) {
> > > +		if (iocb->ki_flags & IOCB_NOWAIT)
> > > +			return -EOPNOTSUPP;
> > > +		inode_lock(inode);
> > > +	}
> > 
> > Currently there's no support for IOCB_NOWAIT for buffered IO so you can
> > replace this with "inode_lock(inode)".
> 
> IOCB_NOWAIT is supported for buffered reads. It is not supported on
> buffered writes (as yet), so this should return EOPNOTSUPP if
> IOCB_NOWAIT is set, regardless of whether the lock can be grabbed or
> not.

Yeah, right. Thanks for correcting me.

								Honza
Matthew Bobrowski Aug. 29, 2019, 11:45 a.m. UTC | #10
On Wed, Aug 28, 2019 at 10:26:19PM +0200, Jan Kara wrote:
> On Mon 12-08-19 22:53:26, Matthew Bobrowski wrote:
> Overall this is very nice. Some smaller comments below.

Awesome, thanks for the review Jan!

> > @@ -235,6 +244,34 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
> >  	return iov_iter_count(from);
> >  }
> >  
> > +static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
> > +					struct iov_iter *from)
> > +{
> > +	ssize_t ret;
> > +	struct inode *inode = file_inode(iocb->ki_filp);
> > +
> > +	if (!inode_trylock(inode)) {
> > +		if (iocb->ki_flags & IOCB_NOWAIT)
> > +			return -EOPNOTSUPP;
> > +		inode_lock(inode);
> > +	}
> 
> Currently there's no support for IOCB_NOWAIT for buffered IO so you can
> replace this with "inode_lock(inode)".

Noted. I've also taken into consideration what Dave mentioned in the
other thread around explicitly checking for IOCB_NOWAIT and returning
EOPTNOTSUPP irrespective whether we can acquire the lock or not.

> > @@ -284,6 +321,128 @@ static int ext4_handle_inode_extension(struct inode *inode, loff_t size,
> >  	return ret;
> >  }
> >  
> 
> I'd mention here that for cases where inode size is extended,
> ext4_dio_write_iter() waits for DIO to complete and thus we are protected
> by inode_lock in that case.

Easy.

> > +static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
> > +				 ssize_t error, unsigned int flags)
> 
> Here I'd expand the comment to explain that we wait in case inode is
> extended so that inode extension in ext4_dio_write_end_io() is properly
> covered by inode_lock.
>

Easy.

> > +	if (ret == -EIOCBQUEUED && (unaligned_aio || extend))
> > +		inode_dio_wait(inode);
> > +
> > +	if (ret >= 0 && iov_iter_count(from)) {
> > +		overwrite ? inode_unlock_shared(inode) : inode_unlock(inode);
> > +		return ext4_buffered_write_iter(iocb, from);
> > +	}
> > +out:
> > +	overwrite ? inode_unlock_shared(inode) : inode_unlock(inode);
> > +	return ret;
> > +}
> > +
> >  #ifdef CONFIG_FS_DAX
> >  static ssize_t
> >  ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
> 
> ...
> 
> > @@ -3581,10 +3611,10 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> >  		iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
> >  		iomap->addr = IOMAP_NULL_ADDR;
> >  	} else {
> > -		if (map.m_flags & EXT4_MAP_MAPPED) {
> > -			iomap->type = IOMAP_MAPPED;
> > -		} else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
> > +		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
> >  			iomap->type = IOMAP_UNWRITTEN;
> > +		} else if (map.m_flags & EXT4_MAP_MAPPED) {
> > +			iomap->type = IOMAP_MAPPED;
> >  		} else {
> >  			WARN_ON_ONCE(1);
> >  			return -EIO;
> 
> Possibly this hunk should go into a separate patch (since this is not
> directly related with iomap conversion) with a changelog / comment
> explaining why we need to check EXT4_MAP_UNWRITTEN first.

But wouldn't doing so break bisection? Seeing as though we needed to
change this statement specifically to accommodate for the weirdness
being returned from ext4_map_blocks()? i.e. map.m_flags being set to
either of the following:

	- (EXT4_MAP_NEW | EXT4_MAP_MAPPED)
        or
        - (EXT4_MAP_NEW | EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN)

So, if we left the statement in its original form, we'd allocate
unwritten extents but never actually get around to converting them in
ext4_dio_write_end_io() as IOMAP_DIO_UNWRITTEN would never be set?

--M
Matthew Bobrowski Aug. 29, 2019, 11:47 a.m. UTC | #11
On Thu, Aug 29, 2019 at 08:32:18AM +1000, Dave Chinner wrote:
> On Wed, Aug 28, 2019 at 10:26:19PM +0200, Jan Kara wrote:
> > On Mon 12-08-19 22:53:26, Matthew Bobrowski wrote:
> > > @@ -235,6 +244,34 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
> > >  	return iov_iter_count(from);
> > >  }
> > >  
> > > +static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
> > > +					struct iov_iter *from)
> > > +{
> > > +	ssize_t ret;
> > > +	struct inode *inode = file_inode(iocb->ki_filp);
> > > +
> > > +	if (!inode_trylock(inode)) {
> > > +		if (iocb->ki_flags & IOCB_NOWAIT)
> > > +			return -EOPNOTSUPP;
> > > +		inode_lock(inode);
> > > +	}
> > 
> > Currently there's no support for IOCB_NOWAIT for buffered IO so you can
> > replace this with "inode_lock(inode)".
> 
> IOCB_NOWAIT is supported for buffered reads. It is not supported on
> buffered writes (as yet), so this should return EOPNOTSUPP if
> IOCB_NOWAIT is set, regardless of whether the lock can be grabbed or
> not.

Noted! Thank you Dave. ;-)

--M
Jan Kara Aug. 29, 2019, 12:38 p.m. UTC | #12
On Thu 29-08-19 21:45:17, Matthew Bobrowski wrote:
> > > @@ -3581,10 +3611,10 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> > >  		iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
> > >  		iomap->addr = IOMAP_NULL_ADDR;
> > >  	} else {
> > > -		if (map.m_flags & EXT4_MAP_MAPPED) {
> > > -			iomap->type = IOMAP_MAPPED;
> > > -		} else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
> > > +		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
> > >  			iomap->type = IOMAP_UNWRITTEN;
> > > +		} else if (map.m_flags & EXT4_MAP_MAPPED) {
> > > +			iomap->type = IOMAP_MAPPED;
> > >  		} else {
> > >  			WARN_ON_ONCE(1);
> > >  			return -EIO;
> > 
> > Possibly this hunk should go into a separate patch (since this is not
> > directly related with iomap conversion) with a changelog / comment
> > explaining why we need to check EXT4_MAP_UNWRITTEN first.
> 
> But wouldn't doing so break bisection? Seeing as though we needed to
> change this statement specifically to accommodate for the weirdness
> being returned from ext4_map_blocks()? i.e. map.m_flags being set to
> either of the following:
> 
> 	- (EXT4_MAP_NEW | EXT4_MAP_MAPPED)
>         or
>         - (EXT4_MAP_NEW | EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN)
> 
> So, if we left the statement in its original form, we'd allocate
> unwritten extents but never actually get around to converting them in
> ext4_dio_write_end_io() as IOMAP_DIO_UNWRITTEN would never be set?

By splitting into a separate patch, I meant that patch would go before this
one. Original code in ext4_iomap_begin() never called ext4_map_blocks()
with a set of flags that can return with both EXT4_MAP_MAPPED and
EXT4_MAP_UNWRITTEN set so that patch would be a no-op but would fix that
landmine you tripped over.

								Honza
diff mbox series

Patch

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 7470800c63b7..d74576821676 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -29,6 +29,7 @@ 
 #include <linux/pagevec.h>
 #include <linux/uio.h>
 #include <linux/mman.h>
+#include <linux/backing-dev.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -218,6 +219,14 @@  static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
 	if (ret <= 0)
 		return ret;
 
+	ret = file_remove_privs(iocb->ki_filp);
+	if (ret)
+		return 0;
+
+	ret = file_update_time(iocb->ki_filp);
+	if (ret)
+		return 0;
+
 	if (unlikely(IS_IMMUTABLE(inode)))
 		return -EPERM;
 
@@ -235,6 +244,34 @@  static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
 	return iov_iter_count(from);
 }
 
+static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
+					struct iov_iter *from)
+{
+	ssize_t ret;
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	if (!inode_trylock(inode)) {
+		if (iocb->ki_flags & IOCB_NOWAIT)
+			return -EOPNOTSUPP;
+		inode_lock(inode);
+	}
+
+	ret = ext4_write_checks(iocb, from);
+	if (ret <= 0)
+		goto out;
+
+	current->backing_dev_info = inode_to_bdi(inode);
+	ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
+	current->backing_dev_info = NULL;
+out:
+	inode_unlock(inode);
+	if (likely(ret > 0)) {
+		iocb->ki_pos += ret;
+		ret = generic_write_sync(iocb, ret);
+	}
+	return ret;
+}
+
 static int ext4_handle_inode_extension(struct inode *inode, loff_t size,
 				       size_t count)
 {
@@ -284,6 +321,128 @@  static int ext4_handle_inode_extension(struct inode *inode, loff_t size,
 	return ret;
 }
 
+static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
+				 ssize_t error, unsigned int flags)
+{
+	int ret = 0;
+	handle_t *handle;
+	loff_t offset = iocb->ki_pos;
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	if (error) {
+		if (offset + size > i_size_read(inode))
+			ext4_truncate_failed_write(inode);
+
+		/*
+		 * The inode may have been placed onto the orphan list
+		 * as a result of an extension. However, an error may
+		 * have been encountered prior to being able to
+		 * complete the write operation. Perform any necessary
+		 * clean up in this case.
+		 */
+		if (!list_empty(&EXT4_I(inode)->i_orphan)) {
+			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+			if (IS_ERR(handle)) {
+				if (inode->i_nlink)
+					ext4_orphan_del(NULL, inode);
+				return PTR_ERR(handle);
+			}
+
+			if (inode->i_nlink)
+				ext4_orphan_del(handle, inode);
+			ext4_journal_stop(handle);
+		}
+		return error;
+	}
+
+	if (flags & IOMAP_DIO_UNWRITTEN) {
+		ret = ext4_convert_unwritten_extents(NULL, inode, offset, size);
+		if (ret)
+			return ret;
+	}
+
+	if (offset + size > i_size_read(inode)) {
+		ret = ext4_handle_inode_extension(inode, offset + size, 0);
+		if (ret)
+			return ret;
+	}
+	return ret;
+}
+
+static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	ssize_t ret;
+	loff_t offset = iocb->ki_pos;
+	size_t count = iov_iter_count(from);
+	struct inode *inode = file_inode(iocb->ki_filp);
+	bool extend = false, overwrite = false, unaligned_aio = false;
+
+	if (!inode_trylock(inode)) {
+		if (iocb->ki_flags & IOCB_NOWAIT)
+			return -EAGAIN;
+		inode_lock(inode);
+	}
+
+	if (!ext4_dio_checks(inode)) {
+		inode_unlock(inode);
+		/*
+		 * Fallback to buffered IO if the operation on the
+		 * inode is not supported by direct IO.
+		 */
+		return ext4_buffered_write_iter(iocb, from);
+	}
+
+	ret = ext4_write_checks(iocb, from);
+	if (ret <= 0)
+		goto out;
+
+	/*
+	 * Unaligned direct AIO must be serialized among each other as
+	 * the zeroing of partial blocks of two competing unaligned
+	 * AIOs can result in data corruption.
+	 */
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+	    !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
+		unaligned_aio = true;
+		inode_dio_wait(inode);
+	}
+
+	/*
+	 * Determine whether the IO operation will overwrite allocated
+	 * and initialized blocks. If so, check to see whether it is
+	 * possible to take the dioread_nolock path.
+	 */
+	if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
+	    ext4_should_dioread_nolock(inode)) {
+		overwrite = true;
+		downgrade_write(&inode->i_rwsem);
+	}
+
+	if (offset + count > i_size_read(inode) ||
+	    offset + count > EXT4_I(inode)->i_disksize) {
+		ext4_update_i_disksize(inode, inode->i_size);
+		extend = true;
+	}
+
+	ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, ext4_dio_write_end_io);
+
+	/*
+	 * Unaligned direct AIO must be the only IO in flight or else
+	 * any overlapping aligned IO after unaligned IO might result
+	 * in data corruption.
+	 */
+	if (ret == -EIOCBQUEUED && (unaligned_aio || extend))
+		inode_dio_wait(inode);
+
+	if (ret >= 0 && iov_iter_count(from)) {
+		overwrite ? inode_unlock_shared(inode) : inode_unlock(inode);
+		return ext4_buffered_write_iter(iocb, from);
+	}
+out:
+	overwrite ? inode_unlock_shared(inode) : inode_unlock(inode);
+	return ret;
+}
+
 #ifdef CONFIG_FS_DAX
 static ssize_t
 ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
@@ -300,12 +459,6 @@  ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	ret = ext4_write_checks(iocb, from);
 	if (ret <= 0)
 		goto out;
-	ret = file_remove_privs(iocb->ki_filp);
-	if (ret)
-		goto out;
-	ret = file_update_time(iocb->ki_filp);
-	if (ret)
-		goto out;
 
 	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
 
@@ -327,10 +480,6 @@  static ssize_t
 ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
-	int o_direct = iocb->ki_flags & IOCB_DIRECT;
-	int unaligned_aio = 0;
-	int overwrite = 0;
-	ssize_t ret;
 
 	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 		return -EIO;
@@ -339,61 +488,9 @@  ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (IS_DAX(inode))
 		return ext4_dax_write_iter(iocb, from);
 #endif
-	if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
-		return -EOPNOTSUPP;
-
-	if (!inode_trylock(inode)) {
-		if (iocb->ki_flags & IOCB_NOWAIT)
-			return -EAGAIN;
-		inode_lock(inode);
-	}
-
-	ret = ext4_write_checks(iocb, from);
-	if (ret <= 0)
-		goto out;
-
-	/*
-	 * Unaligned direct AIO must be serialized among each other as zeroing
-	 * of partial blocks of two competing unaligned AIOs can result in data
-	 * corruption.
-	 */
-	if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
-	    !is_sync_kiocb(iocb) &&
-	    ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
-		unaligned_aio = 1;
-		ext4_unwritten_wait(inode);
-	}
-
-	iocb->private = &overwrite;
-	/* Check whether we do a DIO overwrite or not */
-	if (o_direct && !unaligned_aio) {
-		if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
-			if (ext4_should_dioread_nolock(inode))
-				overwrite = 1;
-		} else if (iocb->ki_flags & IOCB_NOWAIT) {
-			ret = -EAGAIN;
-			goto out;
-		}
-	}
-
-	ret = __generic_file_write_iter(iocb, from);
-	/*
-	 * Unaligned direct AIO must be the only IO in flight. Otherwise
-	 * overlapping aligned IO after unaligned might result in data
-	 * corruption.
-	 */
-	if (ret == -EIOCBQUEUED && unaligned_aio)
-		ext4_unwritten_wait(inode);
-	inode_unlock(inode);
-
-	if (ret > 0)
-		ret = generic_write_sync(iocb, ret);
-
-	return ret;
-
-out:
-	inode_unlock(inode);
-	return ret;
+	if (iocb->ki_flags & IOCB_DIRECT)
+		return ext4_dio_write_iter(iocb, from);
+	return ext4_buffered_write_iter(iocb, from);
 }
 
 #ifdef CONFIG_FS_DAX
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 761ce6286b05..9155a8a6eb0b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3533,8 +3533,38 @@  static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 		if (IS_ERR(handle))
 			return PTR_ERR(handle);
 
-		ret = ext4_map_blocks(handle, inode, &map,
-				      EXT4_GET_BLOCKS_CREATE_ZERO);
+		if (IS_DAX(inode)) {
+			ret = ext4_map_blocks(handle, inode, &map,
+					      EXT4_GET_BLOCKS_CREATE_ZERO);
+		} else {
+			/*
+			 * DAX and direct IO are the only two
+			 * operations currently supported with
+			 * IOMAP_WRITE.
+			 */
+			WARN_ON(!(flags & IOMAP_DIRECT));
+			if (round_down(offset, i_blocksize(inode)) >=
+			    i_size_read(inode)) {
+				ret = ext4_map_blocks(handle, inode, &map,
+						      EXT4_GET_BLOCKS_CREATE);
+			} else if (!ext4_test_inode_flag(inode,
+							 EXT4_INODE_EXTENTS)) {
+				/*
+				 * We cannot fill holes in indirect
+				 * tree based inodes as that could
+				 * expose stale data in the case of a
+				 * crash. Use magic error code to
+				 * fallback to buffered IO.
+				 */
+				ret = ext4_map_blocks(handle, inode, &map, 0);
+				if (ret == 0)
+					ret = -ENOTBLK;
+			} else {
+				ret = ext4_map_blocks(handle, inode, &map,
+						      EXT4_GET_BLOCKS_IO_CREATE_EXT);
+			}
+		}
+
 		if (ret < 0) {
 			ext4_journal_stop(handle);
 			if (ret == -ENOSPC &&
@@ -3581,10 +3611,10 @@  static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 		iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
 		iomap->addr = IOMAP_NULL_ADDR;
 	} else {
-		if (map.m_flags & EXT4_MAP_MAPPED) {
-			iomap->type = IOMAP_MAPPED;
-		} else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
 			iomap->type = IOMAP_UNWRITTEN;
+		} else if (map.m_flags & EXT4_MAP_MAPPED) {
+			iomap->type = IOMAP_MAPPED;
 		} else {
 			WARN_ON_ONCE(1);
 			return -EIO;
@@ -3601,6 +3631,8 @@  static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
 			  ssize_t written, unsigned flags, struct iomap *iomap)
 {
+	if (flags & IOMAP_DIRECT && written == 0)
+		return -ENOTBLK;
 	return 0;
 }