diff mbox

[7/8] xfs: split direct I/O and DAX path

Message ID 1466609236-23801-8-git-send-email-hch@lst.de (mailing list archive)
State New, archived
Headers show

Commit Message

Christoph Hellwig June 22, 2016, 3:27 p.m. UTC
So far the DAX code overloaded the direct I/O code path.  There is very little
in common between the two, and untangling them allows to clean up both variants.

As a ѕide effect we also get separate trace points for both I/O types.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_file.c  | 139 ++++++++++++++++++++++++++++++++++++++++++-----------
 fs/xfs/xfs_trace.h |   2 +
 2 files changed, 112 insertions(+), 29 deletions(-)

Comments

Darrick J. Wong Sept. 29, 2016, 2:53 a.m. UTC | #1
On Wed, Jun 22, 2016 at 05:27:15PM +0200, Christoph Hellwig wrote:
> So far the DAX code overloaded the direct I/O code path.  There is very little
> in common between the two, and untangling them allows to clean up both variants.
> 
> As a ѕide effect we also get separate trace points for both I/O types.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_file.c  | 139 ++++++++++++++++++++++++++++++++++++++++++-----------
>  fs/xfs/xfs_trace.h |   2 +
>  2 files changed, 112 insertions(+), 29 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 24b267d..0e74325 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -262,13 +262,11 @@ xfs_file_dio_aio_read(
>  	else
>  		target = ip->i_mount->m_ddev_targp;
>  
> -	if (!IS_DAX(inode)) {
> -		/* DIO must be aligned to device logical sector size */
> -		if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
> -			if (iocb->ki_pos == isize)
> -				return 0;
> -			return -EINVAL;
> -		}
> +	/* DIO must be aligned to device logical sector size */
> +	if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
> +		if (iocb->ki_pos == isize)
> +			return 0;
> +		return -EINVAL;
>  	}
>  
>  	/*
> @@ -317,13 +315,37 @@ xfs_file_dio_aio_read(
>  	}
>  
>  	data = *to;
> -	if (IS_DAX(inode)) {
> -		ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
> -				NULL, 0);
> -	} else {
> -		ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
> -				xfs_get_blocks_direct, NULL, NULL, 0);
> +	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
> +			xfs_get_blocks_direct, NULL, NULL, 0);
> +	if (ret > 0) {
> +		iocb->ki_pos += ret;
> +		iov_iter_advance(to, ret);
>  	}
> +	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
> +
> +	file_accessed(iocb->ki_filp);

So I noticed that generic/323 starts crashing in file_accessed -> touch_atime
because iocb->ki_filp->f_path.dentry == NULL.  For a while I thought it was
some weird reflink bug, but I finally had time to go build a vanilla 4.8-rc8
kernel and that blew up here too.  I'm not sure why this line got inserted
here, since it wasn't there prior to this patch, AFAICT.

<shrug>

--D

> +	return ret;
> +}
> +
> +STATIC ssize_t
> +xfs_file_dax_read(
> +	struct kiocb		*iocb,
> +	struct iov_iter		*to)
> +{
> +	struct address_space	*mapping = iocb->ki_filp->f_mapping;
> +	struct inode		*inode = mapping->host;
> +	struct xfs_inode	*ip = XFS_I(inode);
> +	struct iov_iter		data = *to;
> +	size_t			count = iov_iter_count(to);
> +	ssize_t			ret = 0;
> +
> +	trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
> +
> +	if (!count)
> +		return 0; /* skip atime */
> +
> +	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
> +	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
>  	if (ret > 0) {
>  		iocb->ki_pos += ret;
>  		iov_iter_advance(to, ret);
> @@ -356,7 +378,8 @@ xfs_file_read_iter(
>  	struct kiocb		*iocb,
>  	struct iov_iter		*to)
>  {
> -	struct xfs_mount	*mp = XFS_I(file_inode(iocb->ki_filp))->i_mount;
> +	struct inode		*inode = file_inode(iocb->ki_filp);
> +	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
>  	ssize_t			ret = 0;
>  
>  	XFS_STATS_INC(mp, xs_read_calls);
> @@ -364,7 +387,9 @@ xfs_file_read_iter(
>  	if (XFS_FORCED_SHUTDOWN(mp))
>  		return -EIO;
>  
> -	if (iocb->ki_flags & IOCB_DIRECT)
> +	if (IS_DAX(inode))
> +		ret = xfs_file_dax_read(iocb, to);
> +	else if (iocb->ki_flags & IOCB_DIRECT)
>  		ret = xfs_file_dio_aio_read(iocb, to);
>  	else
>  		ret = xfs_file_buffered_aio_read(iocb, to);
> @@ -586,8 +611,7 @@ xfs_file_dio_aio_write(
>  					mp->m_rtdev_targp : mp->m_ddev_targp;
>  
>  	/* DIO must be aligned to device logical sector size */
> -	if (!IS_DAX(inode) &&
> -	    ((iocb->ki_pos | count) & target->bt_logical_sectormask))
> +	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
>  		return -EINVAL;
>  
>  	/* "unaligned" here means not aligned to a filesystem block */
> @@ -656,14 +680,9 @@ xfs_file_dio_aio_write(
>  	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
>  
>  	data = *from;
> -	if (IS_DAX(inode)) {
> -		ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
> -				xfs_end_io_direct_write, 0);
> -	} else {
> -		ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
> -				xfs_get_blocks_direct, xfs_end_io_direct_write,
> -				NULL, DIO_ASYNC_EXTEND);
> -	}
> +	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
> +			xfs_get_blocks_direct, xfs_end_io_direct_write,
> +			NULL, DIO_ASYNC_EXTEND);
>  
>  	/* see generic_file_direct_write() for why this is necessary */
>  	if (mapping->nrpages) {
> @@ -680,10 +699,70 @@ out:
>  	xfs_rw_iunlock(ip, iolock);
>  
>  	/*
> -	 * No fallback to buffered IO on errors for XFS. DAX can result in
> -	 * partial writes, but direct IO will either complete fully or fail.
> +	 * No fallback to buffered IO on errors for XFS, direct IO will either
> +	 * complete fully or fail.
> +	 */
> +	ASSERT(ret < 0 || ret == count);
> +	return ret;
> +}
> +
> +STATIC ssize_t
> +xfs_file_dax_write(
> +	struct kiocb		*iocb,
> +	struct iov_iter		*from)
> +{
> +	struct address_space	*mapping = iocb->ki_filp->f_mapping;
> +	struct inode		*inode = mapping->host;
> +	struct xfs_inode	*ip = XFS_I(inode);
> +	struct xfs_mount	*mp = ip->i_mount;
> +	ssize_t			ret = 0;
> +	int			unaligned_io = 0;
> +	int			iolock;
> +	struct iov_iter		data;
> +
> +	/* "unaligned" here means not aligned to a filesystem block */
> +	if ((iocb->ki_pos & mp->m_blockmask) ||
> +	    ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
> +		unaligned_io = 1;
> +		iolock = XFS_IOLOCK_EXCL;
> +	} else if (mapping->nrpages) {
> +		iolock = XFS_IOLOCK_EXCL;
> +	} else {
> +		iolock = XFS_IOLOCK_SHARED;
> +	}
> +	xfs_rw_ilock(ip, iolock);
> +
> +	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
> +	if (ret)
> +		goto out;
> +
> +	/*
> +	 * Yes, even DAX files can have page cache attached to them:  A zeroed
> +	 * page is inserted into the pagecache when we have to serve a write
> +	 * fault on a hole.  It should never be dirtied and can simply be
> +	 * dropped from the pagecache once we get real data for the page.
>  	 */
> -	ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
> +	if (mapping->nrpages) {
> +		ret = invalidate_inode_pages2(mapping);
> +		WARN_ON_ONCE(ret);
> +	}
> +
> +	if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
> +		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
> +		iolock = XFS_IOLOCK_SHARED;
> +	}
> +
> +	trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
> +
> +	data = *from;
> +	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
> +			xfs_end_io_direct_write, 0);
> +	if (ret > 0) {
> +		iocb->ki_pos += ret;
> +		iov_iter_advance(from, ret);
> +	}
> +out:
> +	xfs_rw_iunlock(ip, iolock);
>  	return ret;
>  }
>  
> @@ -765,7 +844,9 @@ xfs_file_write_iter(
>  	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
>  		return -EIO;
>  
> -	if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
> +	if (IS_DAX(inode))
> +		ret = xfs_file_dax_write(iocb, from);
> +	else if (iocb->ki_flags & IOCB_DIRECT)
>  		ret = xfs_file_dio_aio_write(iocb, from);
>  	else
>  		ret = xfs_file_buffered_aio_write(iocb, from);
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 2504f94..1451690 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -1165,8 +1165,10 @@ DEFINE_EVENT(xfs_file_class, name,	\
>  	TP_ARGS(ip, count, offset))
>  DEFINE_RW_EVENT(xfs_file_buffered_read);
>  DEFINE_RW_EVENT(xfs_file_direct_read);
> +DEFINE_RW_EVENT(xfs_file_dax_read);
>  DEFINE_RW_EVENT(xfs_file_buffered_write);
>  DEFINE_RW_EVENT(xfs_file_direct_write);
> +DEFINE_RW_EVENT(xfs_file_dax_write);
>  DEFINE_RW_EVENT(xfs_file_splice_read);
>  
>  DECLARE_EVENT_CLASS(xfs_page_class,
> -- 
> 2.1.4
> 
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs
diff mbox

Patch

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 24b267d..0e74325 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -262,13 +262,11 @@  xfs_file_dio_aio_read(
 	else
 		target = ip->i_mount->m_ddev_targp;
 
-	if (!IS_DAX(inode)) {
-		/* DIO must be aligned to device logical sector size */
-		if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
-			if (iocb->ki_pos == isize)
-				return 0;
-			return -EINVAL;
-		}
+	/* DIO must be aligned to device logical sector size */
+	if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
+		if (iocb->ki_pos == isize)
+			return 0;
+		return -EINVAL;
 	}
 
 	/*
@@ -317,13 +315,37 @@  xfs_file_dio_aio_read(
 	}
 
 	data = *to;
-	if (IS_DAX(inode)) {
-		ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
-				NULL, 0);
-	} else {
-		ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
-				xfs_get_blocks_direct, NULL, NULL, 0);
+	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+			xfs_get_blocks_direct, NULL, NULL, 0);
+	if (ret > 0) {
+		iocb->ki_pos += ret;
+		iov_iter_advance(to, ret);
 	}
+	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+	file_accessed(iocb->ki_filp);
+	return ret;
+}
+
+STATIC ssize_t
+xfs_file_dax_read(
+	struct kiocb		*iocb,
+	struct iov_iter		*to)
+{
+	struct address_space	*mapping = iocb->ki_filp->f_mapping;
+	struct inode		*inode = mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct iov_iter		data = *to;
+	size_t			count = iov_iter_count(to);
+	ssize_t			ret = 0;
+
+	trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+
+	if (!count)
+		return 0; /* skip atime */
+
+	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
 	if (ret > 0) {
 		iocb->ki_pos += ret;
 		iov_iter_advance(to, ret);
@@ -356,7 +378,8 @@  xfs_file_read_iter(
 	struct kiocb		*iocb,
 	struct iov_iter		*to)
 {
-	struct xfs_mount	*mp = XFS_I(file_inode(iocb->ki_filp))->i_mount;
+	struct inode		*inode = file_inode(iocb->ki_filp);
+	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
 	ssize_t			ret = 0;
 
 	XFS_STATS_INC(mp, xs_read_calls);
@@ -364,7 +387,9 @@  xfs_file_read_iter(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	if (iocb->ki_flags & IOCB_DIRECT)
+	if (IS_DAX(inode))
+		ret = xfs_file_dax_read(iocb, to);
+	else if (iocb->ki_flags & IOCB_DIRECT)
 		ret = xfs_file_dio_aio_read(iocb, to);
 	else
 		ret = xfs_file_buffered_aio_read(iocb, to);
@@ -586,8 +611,7 @@  xfs_file_dio_aio_write(
 					mp->m_rtdev_targp : mp->m_ddev_targp;
 
 	/* DIO must be aligned to device logical sector size */
-	if (!IS_DAX(inode) &&
-	    ((iocb->ki_pos | count) & target->bt_logical_sectormask))
+	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
 		return -EINVAL;
 
 	/* "unaligned" here means not aligned to a filesystem block */
@@ -656,14 +680,9 @@  xfs_file_dio_aio_write(
 	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
 
 	data = *from;
-	if (IS_DAX(inode)) {
-		ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
-				xfs_end_io_direct_write, 0);
-	} else {
-		ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
-				xfs_get_blocks_direct, xfs_end_io_direct_write,
-				NULL, DIO_ASYNC_EXTEND);
-	}
+	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+			xfs_get_blocks_direct, xfs_end_io_direct_write,
+			NULL, DIO_ASYNC_EXTEND);
 
 	/* see generic_file_direct_write() for why this is necessary */
 	if (mapping->nrpages) {
@@ -680,10 +699,70 @@  out:
 	xfs_rw_iunlock(ip, iolock);
 
 	/*
-	 * No fallback to buffered IO on errors for XFS. DAX can result in
-	 * partial writes, but direct IO will either complete fully or fail.
+	 * No fallback to buffered IO on errors for XFS, direct IO will either
+	 * complete fully or fail.
+	 */
+	ASSERT(ret < 0 || ret == count);
+	return ret;
+}
+
+STATIC ssize_t
+xfs_file_dax_write(
+	struct kiocb		*iocb,
+	struct iov_iter		*from)
+{
+	struct address_space	*mapping = iocb->ki_filp->f_mapping;
+	struct inode		*inode = mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	ssize_t			ret = 0;
+	int			unaligned_io = 0;
+	int			iolock;
+	struct iov_iter		data;
+
+	/* "unaligned" here means not aligned to a filesystem block */
+	if ((iocb->ki_pos & mp->m_blockmask) ||
+	    ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
+		unaligned_io = 1;
+		iolock = XFS_IOLOCK_EXCL;
+	} else if (mapping->nrpages) {
+		iolock = XFS_IOLOCK_EXCL;
+	} else {
+		iolock = XFS_IOLOCK_SHARED;
+	}
+	xfs_rw_ilock(ip, iolock);
+
+	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+	if (ret)
+		goto out;
+
+	/*
+	 * Yes, even DAX files can have page cache attached to them:  A zeroed
+	 * page is inserted into the pagecache when we have to serve a write
+	 * fault on a hole.  It should never be dirtied and can simply be
+	 * dropped from the pagecache once we get real data for the page.
 	 */
-	ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
+	if (mapping->nrpages) {
+		ret = invalidate_inode_pages2(mapping);
+		WARN_ON_ONCE(ret);
+	}
+
+	if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
+		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+		iolock = XFS_IOLOCK_SHARED;
+	}
+
+	trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
+
+	data = *from;
+	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
+			xfs_end_io_direct_write, 0);
+	if (ret > 0) {
+		iocb->ki_pos += ret;
+		iov_iter_advance(from, ret);
+	}
+out:
+	xfs_rw_iunlock(ip, iolock);
 	return ret;
 }
 
@@ -765,7 +844,9 @@  xfs_file_write_iter(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
-	if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
+	if (IS_DAX(inode))
+		ret = xfs_file_dax_write(iocb, from);
+	else if (iocb->ki_flags & IOCB_DIRECT)
 		ret = xfs_file_dio_aio_write(iocb, from);
 	else
 		ret = xfs_file_buffered_aio_write(iocb, from);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 2504f94..1451690 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1165,8 +1165,10 @@  DEFINE_EVENT(xfs_file_class, name,	\
 	TP_ARGS(ip, count, offset))
 DEFINE_RW_EVENT(xfs_file_buffered_read);
 DEFINE_RW_EVENT(xfs_file_direct_read);
+DEFINE_RW_EVENT(xfs_file_dax_read);
 DEFINE_RW_EVENT(xfs_file_buffered_write);
 DEFINE_RW_EVENT(xfs_file_direct_write);
+DEFINE_RW_EVENT(xfs_file_dax_write);
 DEFINE_RW_EVENT(xfs_file_splice_read);
 
 DECLARE_EVENT_CLASS(xfs_page_class,