diff mbox series

[07/11] xfs: split unaligned DIO write code out

Message ID 20210118193516.2915706-8-hch@lst.de (mailing list archive)
State Superseded
Headers show
Series [01/11] xfs: factor out a xfs_ilock_iocb helper | expand

Commit Message

Christoph Hellwig Jan. 18, 2021, 7:35 p.m. UTC
From: Dave Chinner <dchinner@redhat.com>

The unaligned DIO write path is more convolted than the normal path,
and we are about to make it more complex. Keep the block aligned
fast path dio write code trim and simple by splitting out the
unaligned DIO code from it.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
[hch: rebased, fixed a few minor nits]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_file.c | 168 +++++++++++++++++++++++++---------------------
 1 file changed, 92 insertions(+), 76 deletions(-)

Comments

Brian Foster Jan. 19, 2021, 3:23 p.m. UTC | #1
On Mon, Jan 18, 2021 at 08:35:12PM +0100, Christoph Hellwig wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> The unaligned DIO write path is more convolted than the normal path,
> and we are about to make it more complex. Keep the block aligned
> fast path dio write code trim and simple by splitting out the
> unaligned DIO code from it.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> [hch: rebased, fixed a few minor nits]
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/xfs_file.c | 168 +++++++++++++++++++++++++---------------------
>  1 file changed, 92 insertions(+), 76 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index a696bd34f71d21..bffd7240cefb7f 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -500,117 +500,133 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
>  };
>  
>  /*
> - * xfs_file_dio_write - handle direct IO writes
> + * Handle block aligned direct IO writes
>   *
>   * Lock the inode appropriately to prepare for and issue a direct IO write.
> - * By separating it from the buffered write path we remove all the tricky to
> - * follow locking changes and looping.
>   *
>   * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
>   * until we're sure the bytes at the new EOF have been zeroed and/or the cached
>   * pages are flushed out.
> + */
> +static noinline ssize_t
> +xfs_file_dio_write_aligned(
> +	struct xfs_inode	*ip,
> +	struct kiocb		*iocb,
> +	struct iov_iter		*from)
> +{
> +	int			iolock = XFS_IOLOCK_SHARED;
> +	ssize_t			ret;
> +
> +	ret = xfs_ilock_iocb(iocb, iolock);
> +	if (ret)
> +		return ret;
> +	ret = xfs_file_write_checks(iocb, from, &iolock);
> +	if (ret)
> +		goto out_unlock;
> +
> +	/*
> +	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
> +	 * the iolock back to shared if we had to take the exclusive lock in
> +	 * xfs_file_write_checks() for other reasons.
> +	 */
> +	if (iolock == XFS_IOLOCK_EXCL) {
> +		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
> +		iolock = XFS_IOLOCK_SHARED;
> +	}
> +	trace_xfs_file_direct_write(iocb, from);
> +	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
> +			   &xfs_dio_write_ops, is_sync_kiocb(iocb));
> +out_unlock:
> +	if (iolock)
> +		xfs_iunlock(ip, iolock);
> +	return ret;
> +}
> +
> +/*
> + * Handle block unaligned direct IO writes
> + *
> + * In most cases direct IO writes will be done holding IOLOCK_SHARED, allowing
> + * them to be done in parallel with reads and other direct IO writes.  However,
> + * if the I/O is not aligned to filesystem blocks, the direct I/O layer may
> + * need to do sub-block zeroing and that requires serialisation against other
> + * direct I/Os to the same block. In this case we need to serialise the
> + * submission of the unaligned I/Os so that we don't get racing block zeroing in
> + * the dio layer.
>   *
> - * In most cases the direct IO writes will be done holding IOLOCK_SHARED
> - * allowing them to be done in parallel with reads and other direct IO writes.
> - * However, if the IO is not aligned to filesystem blocks, the direct IO layer
> - * needs to do sub-block zeroing and that requires serialisation against other
> - * direct IOs to the same block. In this case we need to serialise the
> - * submission of the unaligned IOs so that we don't get racing block zeroing in
> - * the dio layer.  To avoid the problem with aio, we also need to wait for
> + * To provide the same serialisation for AIO, we also need to wait for
>   * outstanding IOs to complete so that unwritten extent conversion is completed
>   * before we try to map the overlapping block. This is currently implemented by
>   * hitting it with a big hammer (i.e. inode_dio_wait()).
>   *
> - * Returns with locks held indicated by @iolock and errors indicated by
> - * negative return values.
> + * This means that unaligned dio writes always block. There is no "nowait" fast
> + * path in this code - if IOCB_NOWAIT is set we simply return -EAGAIN up front
> + * and we don't have to worry about that anymore.
>   */
> -STATIC ssize_t
> -xfs_file_dio_write(
> +static noinline ssize_t
> +xfs_file_dio_write_unaligned(
> +	struct xfs_inode	*ip,
>  	struct kiocb		*iocb,
>  	struct iov_iter		*from)
>  {
> -	struct file		*file = iocb->ki_filp;
> -	struct address_space	*mapping = file->f_mapping;
> -	struct inode		*inode = mapping->host;
> -	struct xfs_inode	*ip = XFS_I(inode);
> -	struct xfs_mount	*mp = ip->i_mount;
> -	ssize_t			ret = 0;
> -	int			unaligned_io = 0;
> -	int			iolock;
> -	size_t			count = iov_iter_count(from);
> -	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
> +	int			iolock = XFS_IOLOCK_EXCL;
> +	ssize_t			ret;
>  
> -	/* DIO must be aligned to device logical sector size */
> -	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
> -		return -EINVAL;
> +	/* unaligned dio always waits, bail */
> +	if (iocb->ki_flags & IOCB_NOWAIT)
> +		return -EAGAIN;
> +	xfs_ilock(ip, iolock);
>  
>  	/*
> -	 * Don't take the exclusive iolock here unless the I/O is unaligned to
> -	 * the file system block size.  We don't need to consider the EOF
> -	 * extension case here because xfs_file_write_checks() will relock
> -	 * the inode as necessary for EOF zeroing cases and fill out the new
> -	 * inode size as appropriate.
> +	 * We can't properly handle unaligned direct I/O to reflink files yet,
> +	 * as we can't unshare a partial block.
>  	 */
> -	if ((iocb->ki_pos & mp->m_blockmask) ||
> -	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
> -		unaligned_io = 1;
> -
> -		/*
> -		 * We can't properly handle unaligned direct I/O to reflink
> -		 * files yet, as we can't unshare a partial block.
> -		 */
> -		if (xfs_is_cow_inode(ip)) {
> -			trace_xfs_reflink_bounce_dio_write(iocb, from);
> -			return -ENOTBLK;
> -		}
> -		iolock = XFS_IOLOCK_EXCL;
> -	} else {
> -		iolock = XFS_IOLOCK_SHARED;
> -	}
> -
> -	if (iocb->ki_flags & IOCB_NOWAIT) {
> -		/* unaligned dio always waits, bail */
> -		if (unaligned_io)
> -			return -EAGAIN;
> -		if (!xfs_ilock_nowait(ip, iolock))
> -			return -EAGAIN;
> -	} else {
> -		xfs_ilock(ip, iolock);
> +	if (xfs_is_cow_inode(ip)) {
> +		trace_xfs_reflink_bounce_dio_write(iocb, from);
> +		ret = -ENOTBLK;
> +		goto out_unlock;
>  	}
>  
>  	ret = xfs_file_write_checks(iocb, from, &iolock);
>  	if (ret)
> -		goto out;
> -	count = iov_iter_count(from);
> +		goto out_unlock;
>  
>  	/*
> -	 * If we are doing unaligned IO, we can't allow any other overlapping IO
> -	 * in-flight at the same time or we risk data corruption. Wait for all
> -	 * other IO to drain before we submit. If the IO is aligned, demote the
> -	 * iolock if we had to take the exclusive lock in
> -	 * xfs_file_write_checks() for other reasons.
> +	 * If we are doing unaligned I/O, we can't allow any other overlapping
> +	 * I/O in-flight at the same time or we risk data corruption. Wait for
> +	 * all other I/O to drain before we submit.
>  	 */
> -	if (unaligned_io) {
> -		inode_dio_wait(inode);
> -	} else if (iolock == XFS_IOLOCK_EXCL) {
> -		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
> -		iolock = XFS_IOLOCK_SHARED;
> -	}
> +	inode_dio_wait(VFS_I(ip));
>  
> -	trace_xfs_file_direct_write(iocb, from);
>  	/*
> -	 * If unaligned, this is the only IO in-flight. Wait on it before we
> -	 * release the iolock to prevent subsequent overlapping IO.
> +	 * This must be the only I/O in-flight. Wait on it before we release the
> +	 * iolock to prevent subsequent overlapping I/O.
>  	 */
> +	trace_xfs_file_direct_write(iocb, from);
>  	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
> -			   &xfs_dio_write_ops,
> -			   is_sync_kiocb(iocb) || unaligned_io);
> -out:
> +			   &xfs_dio_write_ops, true);
> +out_unlock:
>  	if (iolock)
>  		xfs_iunlock(ip, iolock);
>  	return ret;
>  }
>  
> +static ssize_t
> +xfs_file_dio_write(
> +	struct kiocb		*iocb,
> +	struct iov_iter		*from)
> +{
> +	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
> +	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
> +	size_t			count = iov_iter_count(from);
> +
> +	/* DIO must be aligned to device logical sector size */
> +	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
> +		return -EINVAL;
> +	if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
> +		return xfs_file_dio_write_unaligned(ip, iocb, from);
> +	return xfs_file_dio_write_aligned(ip, iocb, from);
> +}
> +
>  static noinline ssize_t
>  xfs_file_dax_write(
>  	struct kiocb		*iocb,
> -- 
> 2.29.2
>
Darrick J. Wong Jan. 20, 2021, 6:46 p.m. UTC | #2
On Mon, Jan 18, 2021 at 08:35:12PM +0100, Christoph Hellwig wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> The unaligned DIO write path is more convolted than the normal path,
> and we are about to make it more complex. Keep the block aligned
> fast path dio write code trim and simple by splitting out the
> unaligned DIO code from it.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> [hch: rebased, fixed a few minor nits]
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks good,
Reviewed-by: Darrick J. Wong <djwong@kernel.org>

--D

> ---
>  fs/xfs/xfs_file.c | 168 +++++++++++++++++++++++++---------------------
>  1 file changed, 92 insertions(+), 76 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index a696bd34f71d21..bffd7240cefb7f 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -500,117 +500,133 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
>  };
>  
>  /*
> - * xfs_file_dio_write - handle direct IO writes
> + * Handle block aligned direct IO writes
>   *
>   * Lock the inode appropriately to prepare for and issue a direct IO write.
> - * By separating it from the buffered write path we remove all the tricky to
> - * follow locking changes and looping.
>   *
>   * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
>   * until we're sure the bytes at the new EOF have been zeroed and/or the cached
>   * pages are flushed out.
> + */
> +static noinline ssize_t
> +xfs_file_dio_write_aligned(
> +	struct xfs_inode	*ip,
> +	struct kiocb		*iocb,
> +	struct iov_iter		*from)
> +{
> +	int			iolock = XFS_IOLOCK_SHARED;
> +	ssize_t			ret;
> +
> +	ret = xfs_ilock_iocb(iocb, iolock);
> +	if (ret)
> +		return ret;
> +	ret = xfs_file_write_checks(iocb, from, &iolock);
> +	if (ret)
> +		goto out_unlock;
> +
> +	/*
> +	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
> +	 * the iolock back to shared if we had to take the exclusive lock in
> +	 * xfs_file_write_checks() for other reasons.
> +	 */
> +	if (iolock == XFS_IOLOCK_EXCL) {
> +		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
> +		iolock = XFS_IOLOCK_SHARED;
> +	}
> +	trace_xfs_file_direct_write(iocb, from);
> +	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
> +			   &xfs_dio_write_ops, is_sync_kiocb(iocb));
> +out_unlock:
> +	if (iolock)
> +		xfs_iunlock(ip, iolock);
> +	return ret;
> +}
> +
> +/*
> + * Handle block unaligned direct IO writes
> + *
> + * In most cases direct IO writes will be done holding IOLOCK_SHARED, allowing
> + * them to be done in parallel with reads and other direct IO writes.  However,
> + * if the I/O is not aligned to filesystem blocks, the direct I/O layer may
> + * need to do sub-block zeroing and that requires serialisation against other
> + * direct I/Os to the same block. In this case we need to serialise the
> + * submission of the unaligned I/Os so that we don't get racing block zeroing in
> + * the dio layer.
>   *
> - * In most cases the direct IO writes will be done holding IOLOCK_SHARED
> - * allowing them to be done in parallel with reads and other direct IO writes.
> - * However, if the IO is not aligned to filesystem blocks, the direct IO layer
> - * needs to do sub-block zeroing and that requires serialisation against other
> - * direct IOs to the same block. In this case we need to serialise the
> - * submission of the unaligned IOs so that we don't get racing block zeroing in
> - * the dio layer.  To avoid the problem with aio, we also need to wait for
> + * To provide the same serialisation for AIO, we also need to wait for
>   * outstanding IOs to complete so that unwritten extent conversion is completed
>   * before we try to map the overlapping block. This is currently implemented by
>   * hitting it with a big hammer (i.e. inode_dio_wait()).
>   *
> - * Returns with locks held indicated by @iolock and errors indicated by
> - * negative return values.
> + * This means that unaligned dio writes always block. There is no "nowait" fast
> + * path in this code - if IOCB_NOWAIT is set we simply return -EAGAIN up front
> + * and we don't have to worry about that anymore.
>   */
> -STATIC ssize_t
> -xfs_file_dio_write(
> +static noinline ssize_t
> +xfs_file_dio_write_unaligned(
> +	struct xfs_inode	*ip,
>  	struct kiocb		*iocb,
>  	struct iov_iter		*from)
>  {
> -	struct file		*file = iocb->ki_filp;
> -	struct address_space	*mapping = file->f_mapping;
> -	struct inode		*inode = mapping->host;
> -	struct xfs_inode	*ip = XFS_I(inode);
> -	struct xfs_mount	*mp = ip->i_mount;
> -	ssize_t			ret = 0;
> -	int			unaligned_io = 0;
> -	int			iolock;
> -	size_t			count = iov_iter_count(from);
> -	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
> +	int			iolock = XFS_IOLOCK_EXCL;
> +	ssize_t			ret;
>  
> -	/* DIO must be aligned to device logical sector size */
> -	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
> -		return -EINVAL;
> +	/* unaligned dio always waits, bail */
> +	if (iocb->ki_flags & IOCB_NOWAIT)
> +		return -EAGAIN;
> +	xfs_ilock(ip, iolock);
>  
>  	/*
> -	 * Don't take the exclusive iolock here unless the I/O is unaligned to
> -	 * the file system block size.  We don't need to consider the EOF
> -	 * extension case here because xfs_file_write_checks() will relock
> -	 * the inode as necessary for EOF zeroing cases and fill out the new
> -	 * inode size as appropriate.
> +	 * We can't properly handle unaligned direct I/O to reflink files yet,
> +	 * as we can't unshare a partial block.
>  	 */
> -	if ((iocb->ki_pos & mp->m_blockmask) ||
> -	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
> -		unaligned_io = 1;
> -
> -		/*
> -		 * We can't properly handle unaligned direct I/O to reflink
> -		 * files yet, as we can't unshare a partial block.
> -		 */
> -		if (xfs_is_cow_inode(ip)) {
> -			trace_xfs_reflink_bounce_dio_write(iocb, from);
> -			return -ENOTBLK;
> -		}
> -		iolock = XFS_IOLOCK_EXCL;
> -	} else {
> -		iolock = XFS_IOLOCK_SHARED;
> -	}
> -
> -	if (iocb->ki_flags & IOCB_NOWAIT) {
> -		/* unaligned dio always waits, bail */
> -		if (unaligned_io)
> -			return -EAGAIN;
> -		if (!xfs_ilock_nowait(ip, iolock))
> -			return -EAGAIN;
> -	} else {
> -		xfs_ilock(ip, iolock);
> +	if (xfs_is_cow_inode(ip)) {
> +		trace_xfs_reflink_bounce_dio_write(iocb, from);
> +		ret = -ENOTBLK;
> +		goto out_unlock;
>  	}
>  
>  	ret = xfs_file_write_checks(iocb, from, &iolock);
>  	if (ret)
> -		goto out;
> -	count = iov_iter_count(from);
> +		goto out_unlock;
>  
>  	/*
> -	 * If we are doing unaligned IO, we can't allow any other overlapping IO
> -	 * in-flight at the same time or we risk data corruption. Wait for all
> -	 * other IO to drain before we submit. If the IO is aligned, demote the
> -	 * iolock if we had to take the exclusive lock in
> -	 * xfs_file_write_checks() for other reasons.
> +	 * If we are doing unaligned I/O, we can't allow any other overlapping
> +	 * I/O in-flight at the same time or we risk data corruption. Wait for
> +	 * all other I/O to drain before we submit.
>  	 */
> -	if (unaligned_io) {
> -		inode_dio_wait(inode);
> -	} else if (iolock == XFS_IOLOCK_EXCL) {
> -		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
> -		iolock = XFS_IOLOCK_SHARED;
> -	}
> +	inode_dio_wait(VFS_I(ip));
>  
> -	trace_xfs_file_direct_write(iocb, from);
>  	/*
> -	 * If unaligned, this is the only IO in-flight. Wait on it before we
> -	 * release the iolock to prevent subsequent overlapping IO.
> +	 * This must be the only I/O in-flight. Wait on it before we release the
> +	 * iolock to prevent subsequent overlapping I/O.
>  	 */
> +	trace_xfs_file_direct_write(iocb, from);
>  	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
> -			   &xfs_dio_write_ops,
> -			   is_sync_kiocb(iocb) || unaligned_io);
> -out:
> +			   &xfs_dio_write_ops, true);
> +out_unlock:
>  	if (iolock)
>  		xfs_iunlock(ip, iolock);
>  	return ret;
>  }
>  
> +static ssize_t
> +xfs_file_dio_write(
> +	struct kiocb		*iocb,
> +	struct iov_iter		*from)
> +{
> +	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
> +	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
> +	size_t			count = iov_iter_count(from);
> +
> +	/* DIO must be aligned to device logical sector size */
> +	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
> +		return -EINVAL;
> +	if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
> +		return xfs_file_dio_write_unaligned(ip, iocb, from);
> +	return xfs_file_dio_write_aligned(ip, iocb, from);
> +}
> +
>  static noinline ssize_t
>  xfs_file_dax_write(
>  	struct kiocb		*iocb,
> -- 
> 2.29.2
>
diff mbox series

Patch

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a696bd34f71d21..bffd7240cefb7f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -500,117 +500,133 @@  static const struct iomap_dio_ops xfs_dio_write_ops = {
 };
 
 /*
- * xfs_file_dio_write - handle direct IO writes
+ * Handle block aligned direct IO writes
  *
  * Lock the inode appropriately to prepare for and issue a direct IO write.
- * By separating it from the buffered write path we remove all the tricky to
- * follow locking changes and looping.
  *
  * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
  * until we're sure the bytes at the new EOF have been zeroed and/or the cached
  * pages are flushed out.
+ */
+static noinline ssize_t
+xfs_file_dio_write_aligned(
+	struct xfs_inode	*ip,
+	struct kiocb		*iocb,
+	struct iov_iter		*from)
+{
+	int			iolock = XFS_IOLOCK_SHARED;
+	ssize_t			ret;
+
+	ret = xfs_ilock_iocb(iocb, iolock);
+	if (ret)
+		return ret;
+	ret = xfs_file_write_checks(iocb, from, &iolock);
+	if (ret)
+		goto out_unlock;
+
+	/*
+	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
+	 * the iolock back to shared if we had to take the exclusive lock in
+	 * xfs_file_write_checks() for other reasons.
+	 */
+	if (iolock == XFS_IOLOCK_EXCL) {
+		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
+		iolock = XFS_IOLOCK_SHARED;
+	}
+	trace_xfs_file_direct_write(iocb, from);
+	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
+			   &xfs_dio_write_ops, is_sync_kiocb(iocb));
+out_unlock:
+	if (iolock)
+		xfs_iunlock(ip, iolock);
+	return ret;
+}
+
+/*
+ * Handle block unaligned direct IO writes
+ *
+ * In most cases direct IO writes will be done holding IOLOCK_SHARED, allowing
+ * them to be done in parallel with reads and other direct IO writes.  However,
+ * if the I/O is not aligned to filesystem blocks, the direct I/O layer may
+ * need to do sub-block zeroing and that requires serialisation against other
+ * direct I/Os to the same block. In this case we need to serialise the
+ * submission of the unaligned I/Os so that we don't get racing block zeroing in
+ * the dio layer.
  *
- * In most cases the direct IO writes will be done holding IOLOCK_SHARED
- * allowing them to be done in parallel with reads and other direct IO writes.
- * However, if the IO is not aligned to filesystem blocks, the direct IO layer
- * needs to do sub-block zeroing and that requires serialisation against other
- * direct IOs to the same block. In this case we need to serialise the
- * submission of the unaligned IOs so that we don't get racing block zeroing in
- * the dio layer.  To avoid the problem with aio, we also need to wait for
+ * To provide the same serialisation for AIO, we also need to wait for
  * outstanding IOs to complete so that unwritten extent conversion is completed
  * before we try to map the overlapping block. This is currently implemented by
  * hitting it with a big hammer (i.e. inode_dio_wait()).
  *
- * Returns with locks held indicated by @iolock and errors indicated by
- * negative return values.
+ * This means that unaligned dio writes always block. There is no "nowait" fast
+ * path in this code - if IOCB_NOWAIT is set we simply return -EAGAIN up front
+ * and we don't have to worry about that anymore.
  */
-STATIC ssize_t
-xfs_file_dio_write(
+static noinline ssize_t
+xfs_file_dio_write_unaligned(
+	struct xfs_inode	*ip,
 	struct kiocb		*iocb,
 	struct iov_iter		*from)
 {
-	struct file		*file = iocb->ki_filp;
-	struct address_space	*mapping = file->f_mapping;
-	struct inode		*inode = mapping->host;
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	ssize_t			ret = 0;
-	int			unaligned_io = 0;
-	int			iolock;
-	size_t			count = iov_iter_count(from);
-	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
+	int			iolock = XFS_IOLOCK_EXCL;
+	ssize_t			ret;
 
-	/* DIO must be aligned to device logical sector size */
-	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
-		return -EINVAL;
+	/* unaligned dio always waits, bail */
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		return -EAGAIN;
+	xfs_ilock(ip, iolock);
 
 	/*
-	 * Don't take the exclusive iolock here unless the I/O is unaligned to
-	 * the file system block size.  We don't need to consider the EOF
-	 * extension case here because xfs_file_write_checks() will relock
-	 * the inode as necessary for EOF zeroing cases and fill out the new
-	 * inode size as appropriate.
+	 * We can't properly handle unaligned direct I/O to reflink files yet,
+	 * as we can't unshare a partial block.
 	 */
-	if ((iocb->ki_pos & mp->m_blockmask) ||
-	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
-		unaligned_io = 1;
-
-		/*
-		 * We can't properly handle unaligned direct I/O to reflink
-		 * files yet, as we can't unshare a partial block.
-		 */
-		if (xfs_is_cow_inode(ip)) {
-			trace_xfs_reflink_bounce_dio_write(iocb, from);
-			return -ENOTBLK;
-		}
-		iolock = XFS_IOLOCK_EXCL;
-	} else {
-		iolock = XFS_IOLOCK_SHARED;
-	}
-
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		/* unaligned dio always waits, bail */
-		if (unaligned_io)
-			return -EAGAIN;
-		if (!xfs_ilock_nowait(ip, iolock))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, iolock);
+	if (xfs_is_cow_inode(ip)) {
+		trace_xfs_reflink_bounce_dio_write(iocb, from);
+		ret = -ENOTBLK;
+		goto out_unlock;
 	}
 
 	ret = xfs_file_write_checks(iocb, from, &iolock);
 	if (ret)
-		goto out;
-	count = iov_iter_count(from);
+		goto out_unlock;
 
 	/*
-	 * If we are doing unaligned IO, we can't allow any other overlapping IO
-	 * in-flight at the same time or we risk data corruption. Wait for all
-	 * other IO to drain before we submit. If the IO is aligned, demote the
-	 * iolock if we had to take the exclusive lock in
-	 * xfs_file_write_checks() for other reasons.
+	 * If we are doing unaligned I/O, we can't allow any other overlapping
+	 * I/O in-flight at the same time or we risk data corruption. Wait for
+	 * all other I/O to drain before we submit.
 	 */
-	if (unaligned_io) {
-		inode_dio_wait(inode);
-	} else if (iolock == XFS_IOLOCK_EXCL) {
-		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
-		iolock = XFS_IOLOCK_SHARED;
-	}
+	inode_dio_wait(VFS_I(ip));
 
-	trace_xfs_file_direct_write(iocb, from);
 	/*
-	 * If unaligned, this is the only IO in-flight. Wait on it before we
-	 * release the iolock to prevent subsequent overlapping IO.
+	 * This must be the only I/O in-flight. Wait on it before we release the
+	 * iolock to prevent subsequent overlapping I/O.
 	 */
+	trace_xfs_file_direct_write(iocb, from);
 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
-			   &xfs_dio_write_ops,
-			   is_sync_kiocb(iocb) || unaligned_io);
-out:
+			   &xfs_dio_write_ops, true);
+out_unlock:
 	if (iolock)
 		xfs_iunlock(ip, iolock);
 	return ret;
 }
 
+static ssize_t
+xfs_file_dio_write(
+	struct kiocb		*iocb,
+	struct iov_iter		*from)
+{
+	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
+	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
+	size_t			count = iov_iter_count(from);
+
+	/* DIO must be aligned to device logical sector size */
+	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
+		return -EINVAL;
+	if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
+		return xfs_file_dio_write_unaligned(ip, iocb, from);
+	return xfs_file_dio_write_aligned(ip, iocb, from);
+}
+
 static noinline ssize_t
 xfs_file_dax_write(
 	struct kiocb		*iocb,