@@ -544,22 +544,35 @@ xfs_file_dio_write_aligned(
/*
* Handle block unaligned direct IO writes
*
- * In most cases direct IO writes will be done holding IOLOCK_SHARED, allowing
- * them to be done in parallel with reads and other direct IO writes. However,
- * if the I/O is not aligned to filesystem blocks, the direct I/O layer may
- * need to do sub-block zeroing and that requires serialisation against other
- * direct I/Os to the same block. In this case we need to serialise the
- * submission of the unaligned I/Os so that we don't get racing block zeroing in
- * the dio layer.
+ * In most cases direct IO writes will be done holding IOLOCK_SHARED,
+ * allowing them to be done in parallel with reads and other direct IO writes.
+ * However, if the IO is not aligned to filesystem blocks, the direct IO layer
+ * may need to do sub-block zeroing and that requires serialisation against other
+ * direct IOs to the same block. In the case where sub-block zeroing is not
+ * required, we can do concurrent sub-block dios to the same block successfully.
*
- * To provide the same serialisation for AIO, we also need to wait for
+ * Hence we have two cases here - the shared, optimistic fast path for written
+ * extents, and everything else that needs exclusive IO path access across the
+ * entire IO.
+ *
+ * For the first case, we do all the checks we need at the mapping layer in the
+ * DIO code as part of the existing NOWAIT infrastructure. Hence all we need to
+ * do to support concurrent subblock dio is first try a non-blocking submission.
+ * If that returns -EAGAIN, then we simply repeat the IO submission with full
+ * IO exclusivity guaranteed so that we avoid racing sub-block zeroing.
+ *
+ * The only wrinkle in this case is that the iomap DIO code always does
+ * partial tail sub-block zeroing for post-EOF writes. Hence for any IO that
+ * _ends_ past the current EOF we need to run with full exclusivity. Note that
+ * we also check for the start of IO being beyond EOF because then zeroing
+ * between the old EOF and the start of the IO is required and that also
+ * requires exclusivity. Hence we avoid lock cycles and blocking under
+ * IOCB_NOWAIT for this situation, too.
+ *
+ * To provide the exclusivity required when using AIO, we also need to wait for
* outstanding IOs to complete so that unwritten extent conversion is completed
* before we try to map the overlapping block. This is currently implemented by
* hitting it with a big hammer (i.e. inode_dio_wait()).
- *
- * This means that unaligned dio writes always block. There is no "nowait" fast
- * path in this code - if IOCB_NOWAIT is set we simply return -EAGAIN up front
- * and we don't have to worry about that anymore.
*/
static noinline ssize_t
xfs_file_dio_write_unaligned(
@@ -567,13 +580,27 @@ xfs_file_dio_write_unaligned(
struct kiocb *iocb,
struct iov_iter *from)
{
- int iolock = XFS_IOLOCK_EXCL;
+ size_t isize = i_size_read(VFS_I(ip));
+ size_t count = iov_iter_count(from);
+ int iolock = XFS_IOLOCK_SHARED;
+ unsigned int flags = IOMAP_DIO_NOALLOC;
ssize_t ret;
- /* unaligned dio always waits, bail */
- if (iocb->ki_flags & IOCB_NOWAIT)
- return -EAGAIN;
- xfs_ilock(ip, iolock);
+ /*
+ * Extending writes need exclusivity because of the sub-block zeroing
+ * that the DIO code always does for partial tail blocks beyond EOF.
+ */
+ if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
+retry_exclusive:
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ iolock = XFS_IOLOCK_EXCL;
+ flags = IOMAP_DIO_FORCE_WAIT;
+ }
+
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
/*
* We can't properly handle unaligned direct I/O to reflink files yet,
@@ -590,19 +617,27 @@ xfs_file_dio_write_unaligned(
goto out_unlock;
/*
- * If we are doing unaligned I/O, we can't allow any other overlapping
- * I/O in-flight at the same time or we risk data corruption. Wait for
- * all other I/O to drain before we submit.
+ * If we are doing exclusive unaligned IO, we can't allow any other
+ * overlapping IO in-flight at the same time or we risk data corruption.
+ * Wait for all other IO to drain before we submit.
*/
- inode_dio_wait(VFS_I(ip));
+ if (!(flags & IOMAP_DIO_NOALLOC))
+ inode_dio_wait(VFS_I(ip));
- /*
- * This must be the only I/O in-flight. Wait on it before we release the
- * iolock to prevent subsequent overlapping I/O.
- */
trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops, IOMAP_DIO_FORCE_WAIT);
+ &xfs_dio_write_ops, flags);
+ /*
+ * Retry unaligned IO with exclusive blocking semantics if the DIO
+ * layer rejected it for mapping or locking reasons. If we are doing
+ * nonblocking user IO, propagate the error.
+ */
+ if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
+ ASSERT(flags & IOMAP_DIO_NOALLOC);
+ xfs_iunlock(ip, iolock);
+ goto retry_exclusive;
+ }
+
out_unlock:
if (iolock)
xfs_iunlock(ip, iolock);
@@ -784,15 +784,30 @@ xfs_direct_write_iomap_begin(
goto allocate_blocks;
/*
- * NOWAIT IO needs to span the entire requested IO with a single map so
- * that we avoid partial IO failures due to the rest of the IO range not
- * covered by this map triggering an EAGAIN condition when it is
- * subsequently mapped and aborting the IO.
+ * NOWAIT and NOALLOC IO needs to span the entire requested IO with a
+ * single map so that we avoid partial IO failures due to the rest of
+ * the IO range not covered by this map triggering an EAGAIN condition
+ * when it is subsequently mapped and aborting the IO.
*/
- if ((flags & IOMAP_NOWAIT) &&
- !imap_spans_range(&imap, offset_fsb, end_fsb)) {
+ if (flags & (IOMAP_NOWAIT | IOMAP_NOALLOC)) {
error = -EAGAIN;
- goto out_unlock;
+ if (!imap_spans_range(&imap, offset_fsb, end_fsb))
+ goto out_unlock;
+ }
+
+ /*
+ * For NOALLOC I/O we can't convert an unwritten extent if the I/O is
+ * not block size aligned, as such a conversion would have to do
+ * sub-block zeroing, and that can only be done under an exclusive
+ * IOLOCK. Hence if this is not a written extent, return EAGAIN to tell
+ * the caller to try again.
+ */
+ if (flags & IOMAP_NOALLOC) {
+ error = -EAGAIN;
+ if (imap.br_state != XFS_EXT_NORM &&
+ ((offset & mp->m_blockmask) ||
+ ((offset + length) & mp->m_blockmask)))
+ goto out_unlock;
}
xfs_iunlock(ip, lockmode);
@@ -801,7 +816,7 @@ xfs_direct_write_iomap_begin(
allocate_blocks:
error = -EAGAIN;
- if (flags & IOMAP_NOWAIT)
+ if (flags & (IOMAP_NOWAIT | IOMAP_NOALLOC))
goto out_unlock;
/*