[v3] xfs: allow read IO and FICLONE to run concurrently

Message ID 20231012010845.64286-1-catherine.hoang@oracle.com (mailing list archive)
State Superseded
Series [v3] xfs: allow read IO and FICLONE to run concurrently

Commit Message

Catherine Hoang Oct. 12, 2023, 1:08 a.m. UTC
Clone operations and read IO do not change any data in the source file, so they
should be able to run concurrently. Demote the exclusive locks taken by FICLONE
to shared locks to allow reads while cloning. While a clone is in progress,
writes will take the IOLOCK_EXCL, so they block until the clone completes.

Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
---
 fs/xfs/xfs_file.c    | 68 +++++++++++++++++++++++++++++++++++---------
 fs/xfs/xfs_inode.c   | 17 +++++++++++
 fs/xfs/xfs_inode.h   |  9 ++++++
 fs/xfs/xfs_reflink.c |  4 +++
 4 files changed, 85 insertions(+), 13 deletions(-)
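
For illustration, the clone path whose locking this patch relaxes is the
FICLONE ioctl ("cp --reflink"); a minimal userspace sketch follows (the
file names and error handling here are hypothetical, not part of the
patch):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>		/* FICLONE */

	int main(void)
	{
		int src = open("vm-disk.img", O_RDONLY);
		int dst = open("vm-disk.snap", O_WRONLY | O_CREAT, 0644);

		if (src < 0 || dst < 0)
			return 1;

		/*
		 * Share all of src's extents with dst in one call.  With
		 * this patch, reads of vm-disk.img proceed concurrently
		 * with the clone; writes still block until it completes.
		 */
		if (ioctl(dst, FICLONE, src) < 0) {
			perror("FICLONE");
			return 1;
		}
		return 0;
	}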

Comments

Darrick J. Wong Oct. 12, 2023, 6:26 a.m. UTC | #1
On Wed, Oct 11, 2023 at 06:08:45PM -0700, Catherine Hoang wrote:
> Clone operations and read IO do not change any data in the source file, so they
> should be able to run concurrently. Demote the exclusive locks taken by FICLONE
> to shared locks to allow reads while cloning. While a clone is in progress,
> writes will take the IOLOCK_EXCL, so they block until the clone completes.
> 
> Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
> ---
>  fs/xfs/xfs_file.c    | 68 +++++++++++++++++++++++++++++++++++---------
>  fs/xfs/xfs_inode.c   | 17 +++++++++++
>  fs/xfs/xfs_inode.h   |  9 ++++++
>  fs/xfs/xfs_reflink.c |  4 +++
>  4 files changed, 85 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 203700278ddb..425507b0d1cb 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -214,6 +214,48 @@ xfs_ilock_iocb(
>  	return 0;
>  }
>  
> +static int
> +xfs_ilock_iocb_for_write(
> +	struct kiocb		*iocb,
> +	unsigned int		*lock_mode)
> +{
> +	ssize_t			ret;
> +	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
> +
> +	do {
> +		ret = xfs_ilock_iocb(iocb, *lock_mode);
> +		if (ret)
> +			return ret;
> +		if (*lock_mode == XFS_IOLOCK_EXCL)
> +			return 0;
> +		if (!xfs_iflags_test(ip, XFS_IREMAPPING))
> +			return 0;
> +		xfs_iunlock(ip, *lock_mode);
> +		*lock_mode = XFS_IOLOCK_EXCL;
> +	} while (1);
> +	/* notreached */

Might want to change that to an ASSERT(0) so we'll notice if this ever
breaks.
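
i.e. the tail of the function could look something like this (sketch,
untested):

	} while (1);
	/* notreached */
	ASSERT(0);
	return -EAGAIN;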

> +	return -EAGAIN;
> +}
> +
> +static int

"unsigned int", please be consistent with the other inode lock flag
usage around the codebase.

> +xfs_ilock_for_write_fault(
> +	struct xfs_inode	*ip)
> +{
> +	int			lock_mode = XFS_MMAPLOCK_SHARED;
> +
> +	do {
> +		xfs_ilock(ip, lock_mode);
> +		if (!xfs_iflags_test(ip, XFS_IREMAPPING))
> +			return lock_mode;
> +		if (lock_mode == XFS_MMAPLOCK_EXCL)
> +			return lock_mode;
> +		xfs_iunlock(ip, lock_mode);
> +		lock_mode = XFS_MMAPLOCK_EXCL;
> +	} while (1);
> +	/* notreached */
> +	return 0;

I wonder if this would be easier to understand with the loop unrolled?

	/* get a shared lock if no remapping in progress */
	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	if (!xfs_iflags_test(ip, XFS_IREMAPPING))
		return XFS_MMAPLOCK_SHARED;

	/* wait for remapping to complete */
	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
	return XFS_MMAPLOCK_EXCL;

It eliminates the awkward /* notreached */ construction.

Thoughts?

> +}
> +
>  STATIC ssize_t
>  xfs_file_dio_read(
>  	struct kiocb		*iocb,
> @@ -551,7 +593,7 @@ xfs_file_dio_write_aligned(
>  	unsigned int		iolock = XFS_IOLOCK_SHARED;
>  	ssize_t			ret;
>  
> -	ret = xfs_ilock_iocb(iocb, iolock);
> +	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
>  	if (ret)
>  		return ret;
>  	ret = xfs_file_write_checks(iocb, from, &iolock);
> @@ -618,7 +660,7 @@ xfs_file_dio_write_unaligned(
>  		flags = IOMAP_DIO_FORCE_WAIT;
>  	}
>  
> -	ret = xfs_ilock_iocb(iocb, iolock);
> +	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
>  	if (ret)
>  		return ret;
>  
> @@ -1180,7 +1222,7 @@ xfs_file_remap_range(
>  	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
>  		xfs_log_force_inode(dest);
>  out_unlock:
> -	xfs_iunlock2_io_mmap(src, dest);
> +	xfs_iunlock2_remapping(src, dest);
>  	if (ret)
>  		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
>  	return remapped > 0 ? remapped : ret;
> @@ -1328,6 +1370,7 @@ __xfs_filemap_fault(
>  	struct inode		*inode = file_inode(vmf->vma->vm_file);
>  	struct xfs_inode	*ip = XFS_I(inode);
>  	vm_fault_t		ret;
> +	int			lock_mode = 0;

unsigned int

--D

>  
>  	trace_xfs_filemap_fault(ip, order, write_fault);
>  
> @@ -1336,25 +1379,24 @@ __xfs_filemap_fault(
>  		file_update_time(vmf->vma->vm_file);
>  	}
>  
> +	if (IS_DAX(inode) || write_fault)
> +		lock_mode = xfs_ilock_for_write_fault(XFS_I(inode));
> +
>  	if (IS_DAX(inode)) {
>  		pfn_t pfn;
>  
> -		xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
>  		ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
>  		if (ret & VM_FAULT_NEEDDSYNC)
>  			ret = dax_finish_sync_fault(vmf, order, pfn);
> -		xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> +	} else if (write_fault) {
> +		ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops);
>  	} else {
> -		if (write_fault) {
> -			xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> -			ret = iomap_page_mkwrite(vmf,
> -					&xfs_page_mkwrite_iomap_ops);
> -			xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> -		} else {
> -			ret = filemap_fault(vmf);
> -		}
> +		ret = filemap_fault(vmf);
>  	}
>  
> +	if (lock_mode)
> +		xfs_iunlock(XFS_I(inode), lock_mode);
> +
>  	if (write_fault)
>  		sb_end_pagefault(inode->i_sb);
>  	return ret;
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 4d55f58d99b7..97b0078249fd 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -3621,6 +3621,23 @@ xfs_iunlock2_io_mmap(
>  		inode_unlock(VFS_I(ip1));
>  }
>  
> +/* Drop the MMAPLOCK and the IOLOCK after a remap completes. */
> +void
> +xfs_iunlock2_remapping(
> +	struct xfs_inode	*ip1,
> +	struct xfs_inode	*ip2)
> +{
> +	xfs_iflags_clear(ip1, XFS_IREMAPPING);
> +
> +	if (ip1 != ip2)
> +		xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED);
> +	xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
> +
> +	if (ip1 != ip2)
> +		inode_unlock_shared(VFS_I(ip1));
> +	inode_unlock(VFS_I(ip2));
> +}
> +
>  /*
>   * Reload the incore inode list for this inode.  Caller should ensure that
>   * the link count cannot change, either by taking ILOCK_SHARED or otherwise
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 0c5bdb91152e..3dc47937da5d 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -347,6 +347,14 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
>  /* Quotacheck is running but inode has not been added to quota counts. */
>  #define XFS_IQUOTAUNCHECKED	(1 << 14)
>  
> +/*
> + * Remap in progress. Callers that wish to update file data while
> + * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake
> + * the lock in exclusive mode. Relocking the file will block until
> + * IREMAPPING is cleared.
> + */
> +#define XFS_IREMAPPING		(1U << 15)
> +
>  /* All inode state flags related to inode reclaim. */
>  #define XFS_ALL_IRECLAIM_FLAGS	(XFS_IRECLAIMABLE | \
>  				 XFS_IRECLAIM | \
> @@ -595,6 +603,7 @@ void xfs_end_io(struct work_struct *work);
>  
>  int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
>  void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
> +void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2);
>  
>  static inline bool
>  xfs_inode_unlinked_incomplete(
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index eb9102453aff..658edee8381d 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -1540,6 +1540,10 @@ xfs_reflink_remap_prep(
>  	if (ret)
>  		goto out_unlock;
>  
> +	xfs_iflags_set(src, XFS_IREMAPPING);
> +	if (inode_in != inode_out)
> +		xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
> +
>  	return 0;
>  out_unlock:
>  	xfs_iunlock2_io_mmap(src, dest);
> -- 
> 2.34.1
>
Christoph Hellwig Oct. 12, 2023, 8:34 a.m. UTC | #2
On Wed, Oct 11, 2023 at 06:08:45PM -0700, Catherine Hoang wrote:
> Clone operations and read IO do not change any data in the source file, so they
> should be able to run concurrently. Demote the exclusive locks taken by FICLONE
> to shared locks to allow reads while cloning. While a clone is in progress,
> writes will take the IOLOCK_EXCL, so they block until the clone completes.

FYI, the first two lines are too long for the normal commit log format.

Can you provide a justification for the change, i.e. workloads for
which this matters, and what metrics are improved?
Darrick J. Wong Oct. 12, 2023, 3:02 p.m. UTC | #3
On Thu, Oct 12, 2023 at 01:34:34AM -0700, Christoph Hellwig wrote:
> On Wed, Oct 11, 2023 at 06:08:45PM -0700, Catherine Hoang wrote:
> > Clone operations and read IO do not change any data in the source file, so they
> > should be able to run concurrently. Demote the exclusive locks taken by FICLONE
> > to shared locks to allow reads while cloning. While a clone is in progress,
> > writes will take the IOLOCK_EXCL, so they block until the clone completes.
> 
> FYI, the first two lines are too long for the normal commit log format.
> 
> Can you provide a justification for the change, i.e. workloads for
> which this matters, and what metrics are improved?

Catherine started with this,
https://lore.kernel.org/linux-xfs/8911B94D-DD29-4D6E-B5BC-32EAF1866245@oracle.com/

and the rest of us whittled it down to the single patch you see here.
Sections 1-2 are still relevant; S3 was the path not taken.

The commit message should Link: to that.
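
e.g. a trailer along these lines:

	Link: https://lore.kernel.org/linux-xfs/8911B94D-DD29-4D6E-B5BC-32EAF1866245@oracle.com/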

--D
Christoph Hellwig Oct. 12, 2023, 3:07 p.m. UTC | #4
On Thu, Oct 12, 2023 at 08:02:31AM -0700, Darrick J. Wong wrote:
> Catherine started with this,
> https://lore.kernel.org/linux-xfs/8911B94D-DD29-4D6E-B5BC-32EAF1866245@oracle.com/
> 
> and the rest of us whittled it down to the single patch you see here.
> Sections 1-2 are still relevant; S3 was the path not taken.

I'd still take the core of that into the actual commit message.  This
part, maybe slightly reworded:

"One of our VM cluster management products needs to snapshot KVM image
files so that they can be restored in case of failure. Snapshotting is
done by redirecting VM disk writes to a sidecar file and using reflink
on the disk image, specifically the FICLONE ioctl as used by
"cp --reflink". Reflink locks the source and destination files while it
operates, which means that reads from the main VM disk image are blocked,
causing the VM to stall. When an image file is heavily fragmented, the
copy process could take several minutes. Some of the VM image files have
50-100 million extent records, and duplicating that much metadata locks
the file for 30 minutes or more. Having activities suspended for such
a long time in a cluster node could result in node eviction."

Patch

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 203700278ddb..425507b0d1cb 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -214,6 +214,48 @@  xfs_ilock_iocb(
 	return 0;
 }
 
+static int
+xfs_ilock_iocb_for_write(
+	struct kiocb		*iocb,
+	unsigned int		*lock_mode)
+{
+	ssize_t			ret;
+	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
+
+	do {
+		ret = xfs_ilock_iocb(iocb, *lock_mode);
+		if (ret)
+			return ret;
+		if (*lock_mode == XFS_IOLOCK_EXCL)
+			return 0;
+		if (!xfs_iflags_test(ip, XFS_IREMAPPING))
+			return 0;
+		xfs_iunlock(ip, *lock_mode);
+		*lock_mode = XFS_IOLOCK_EXCL;
+	} while (1);
+	/* notreached */
+	return -EAGAIN;
+}
+
+static int
+xfs_ilock_for_write_fault(
+	struct xfs_inode	*ip)
+{
+	int			lock_mode = XFS_MMAPLOCK_SHARED;
+
+	do {
+		xfs_ilock(ip, lock_mode);
+		if (!xfs_iflags_test(ip, XFS_IREMAPPING))
+			return lock_mode;
+		if (lock_mode == XFS_MMAPLOCK_EXCL)
+			return lock_mode;
+		xfs_iunlock(ip, lock_mode);
+		lock_mode = XFS_MMAPLOCK_EXCL;
+	} while (1);
+	/* notreached */
+	return 0;
+}
+
 STATIC ssize_t
 xfs_file_dio_read(
 	struct kiocb		*iocb,
@@ -551,7 +593,7 @@  xfs_file_dio_write_aligned(
 	unsigned int		iolock = XFS_IOLOCK_SHARED;
 	ssize_t			ret;
 
-	ret = xfs_ilock_iocb(iocb, iolock);
+	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
 	if (ret)
 		return ret;
 	ret = xfs_file_write_checks(iocb, from, &iolock);
@@ -618,7 +660,7 @@  xfs_file_dio_write_unaligned(
 		flags = IOMAP_DIO_FORCE_WAIT;
 	}
 
-	ret = xfs_ilock_iocb(iocb, iolock);
+	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
 	if (ret)
 		return ret;
 
@@ -1180,7 +1222,7 @@  xfs_file_remap_range(
 	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
 		xfs_log_force_inode(dest);
 out_unlock:
-	xfs_iunlock2_io_mmap(src, dest);
+	xfs_iunlock2_remapping(src, dest);
 	if (ret)
 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
 	return remapped > 0 ? remapped : ret;
@@ -1328,6 +1370,7 @@  __xfs_filemap_fault(
 	struct inode		*inode = file_inode(vmf->vma->vm_file);
 	struct xfs_inode	*ip = XFS_I(inode);
 	vm_fault_t		ret;
+	int			lock_mode = 0;
 
 	trace_xfs_filemap_fault(ip, order, write_fault);
 
@@ -1336,25 +1379,24 @@  __xfs_filemap_fault(
 		file_update_time(vmf->vma->vm_file);
 	}
 
+	if (IS_DAX(inode) || write_fault)
+		lock_mode = xfs_ilock_for_write_fault(XFS_I(inode));
+
 	if (IS_DAX(inode)) {
 		pfn_t pfn;
 
-		xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 		ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
 		if (ret & VM_FAULT_NEEDDSYNC)
 			ret = dax_finish_sync_fault(vmf, order, pfn);
-		xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+	} else if (write_fault) {
+		ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops);
 	} else {
-		if (write_fault) {
-			xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-			ret = iomap_page_mkwrite(vmf,
-					&xfs_page_mkwrite_iomap_ops);
-			xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-		} else {
-			ret = filemap_fault(vmf);
-		}
+		ret = filemap_fault(vmf);
 	}
 
+	if (lock_mode)
+		xfs_iunlock(XFS_I(inode), lock_mode);
+
 	if (write_fault)
 		sb_end_pagefault(inode->i_sb);
 	return ret;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 4d55f58d99b7..97b0078249fd 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3621,6 +3621,23 @@  xfs_iunlock2_io_mmap(
 		inode_unlock(VFS_I(ip1));
 }
 
+/* Drop the MMAPLOCK and the IOLOCK after a remap completes. */
+void
+xfs_iunlock2_remapping(
+	struct xfs_inode	*ip1,
+	struct xfs_inode	*ip2)
+{
+	xfs_iflags_clear(ip1, XFS_IREMAPPING);
+
+	if (ip1 != ip2)
+		xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED);
+	xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+
+	if (ip1 != ip2)
+		inode_unlock_shared(VFS_I(ip1));
+	inode_unlock(VFS_I(ip2));
+}
+
 /*
  * Reload the incore inode list for this inode.  Caller should ensure that
  * the link count cannot change, either by taking ILOCK_SHARED or otherwise
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0c5bdb91152e..3dc47937da5d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -347,6 +347,14 @@  static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
 /* Quotacheck is running but inode has not been added to quota counts. */
 #define XFS_IQUOTAUNCHECKED	(1 << 14)
 
+/*
+ * Remap in progress. Callers that wish to update file data while
+ * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake
+ * the lock in exclusive mode. Relocking the file will block until
+ * IREMAPPING is cleared.
+ */
+#define XFS_IREMAPPING		(1U << 15)
+
 /* All inode state flags related to inode reclaim. */
 #define XFS_ALL_IRECLAIM_FLAGS	(XFS_IRECLAIMABLE | \
 				 XFS_IRECLAIM | \
@@ -595,6 +603,7 @@  void xfs_end_io(struct work_struct *work);
 
 int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
 void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2);
 
 static inline bool
 xfs_inode_unlinked_incomplete(
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index eb9102453aff..658edee8381d 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1540,6 +1540,10 @@  xfs_reflink_remap_prep(
 	if (ret)
 		goto out_unlock;
 
+	xfs_iflags_set(src, XFS_IREMAPPING);
+	if (inode_in != inode_out)
+		xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
+
 	return 0;
 out_unlock:
 	xfs_iunlock2_io_mmap(src, dest);