
[v4] xfs: allow read IO and FICLONE to run concurrently

Message ID 20231017201208.18127-1-catherine.hoang@oracle.com (mailing list archive)
State Superseded, archived
Series [v4] xfs: allow read IO and FICLONE to run concurrently

Commit Message

Catherine Hoang Oct. 17, 2023, 8:12 p.m. UTC
One of our VM cluster management products needs to snapshot KVM image
files so that they can be restored in case of failure. Snapshotting is
done by redirecting VM disk writes to a sidecar file and using reflink
on the disk image, specifically the FICLONE ioctl as used by
"cp --reflink". Reflink locks the source and destination files while it
operates, which means that reads from the main vm disk image are blocked,
causing the vm to stall. When an image file is heavily fragmented, the
copy process could take several minutes. Some of the vm image files have
50-100 million extent records, and duplicating that much metadata locks
the file for 30 minutes or more. Having activities suspended for such
a long time in a cluster node could result in node eviction.

Clone operations and read IO do not change any data in the source file,
so they should be able to run concurrently. Demote the exclusive locks
taken by FICLONE to shared locks to allow reads while cloning. While a
clone is in progress, writes will take the IOLOCK_EXCL, so they block
until the clone completes.

Link: https://lore.kernel.org/linux-xfs/8911B94D-DD29-4D6E-B5BC-32EAF1866245@oracle.com/
Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
---
 fs/xfs/xfs_file.c    | 67 +++++++++++++++++++++++++++++++++++---------
 fs/xfs/xfs_inode.c   | 17 +++++++++++
 fs/xfs/xfs_inode.h   |  9 ++++++
 fs/xfs/xfs_reflink.c |  4 +++
 4 files changed, 84 insertions(+), 13 deletions(-)
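
For context, the FICLONE ioctl that "cp --reflink" issues is a single call on the
destination fd with the source fd as its argument. A minimal userspace sketch
(illustrative only, not part of this patch; file names are placeholders):

	/* clone.c: clone src into dest with FICLONE, like "cp --reflink" */
	#include <sys/ioctl.h>
	#include <linux/fs.h>
	#include <fcntl.h>
	#include <stdio.h>

	int main(int argc, char **argv)
	{
		int src_fd, dest_fd;

		if (argc != 3) {
			fprintf(stderr, "usage: %s <src> <dest>\n", argv[0]);
			return 1;
		}
		src_fd = open(argv[1], O_RDONLY);
		dest_fd = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
		if (src_fd < 0 || dest_fd < 0) {
			perror("open");
			return 1;
		}
		/* With this patch, reads of <src> can proceed while this runs;
		 * writes to <src> still wait for the clone to finish. */
		if (ioctl(dest_fd, FICLONE, src_fd) < 0) {
			perror("FICLONE");
			return 1;
		}
		return 0;
	}

Before this patch, a concurrent read() of <src> would block for the entire
duration of that ioctl.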

Comments

Darrick J. Wong Oct. 17, 2023, 10:59 p.m. UTC | #1
On Tue, Oct 17, 2023 at 01:12:08PM -0700, Catherine Hoang wrote:
> One of our VM cluster management products needs to snapshot KVM image
> files so that they can be restored in case of failure. Snapshotting is
> done by redirecting VM disk writes to a sidecar file and using reflink
> on the disk image, specifically the FICLONE ioctl as used by
> "cp --reflink". Reflink locks the source and destination files while it
> operates, which means that reads from the main vm disk image are blocked,
> causing the vm to stall. When an image file is heavily fragmented, the
> copy process could take several minutes. Some of the vm image files have
> 50-100 million extent records, and duplicating that much metadata locks
> the file for 30 minutes or more. Having activities suspended for such
> a long time in a cluster node could result in node eviction.
> 
> Clone operations and read IO do not change any data in the source file,
> so they should be able to run concurrently. Demote the exclusive locks
> taken by FICLONE to shared locks to allow reads while cloning. While a
> clone is in progress, writes will take the IOLOCK_EXCL, so they block
> until the clone completes.
> 
> Link: https://lore.kernel.org/linux-xfs/8911B94D-DD29-4D6E-B5BC-32EAF1866245@oracle.com/
> Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>

Looks good to me,
Reviewed-by: Darrick J. Wong <djwong@kernel.org>

--D

> ---
>  fs/xfs/xfs_file.c    | 67 +++++++++++++++++++++++++++++++++++---------
>  fs/xfs/xfs_inode.c   | 17 +++++++++++
>  fs/xfs/xfs_inode.h   |  9 ++++++
>  fs/xfs/xfs_reflink.c |  4 +++
>  4 files changed, 84 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 203700278ddb..3b9500e18f90 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -214,6 +214,47 @@ xfs_ilock_iocb(
>  	return 0;
>  }
>  
> +static int
> +xfs_ilock_iocb_for_write(
> +	struct kiocb		*iocb,
> +	unsigned int		*lock_mode)
> +{
> +	ssize_t			ret;
> +	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
> +
> +	ret = xfs_ilock_iocb(iocb, *lock_mode);
> +	if (ret)
> +		return ret;
> +
> +	if (*lock_mode == XFS_IOLOCK_EXCL)
> +		return 0;
> +	if (!xfs_iflags_test(ip, XFS_IREMAPPING))
> +		return 0;
> +
> +	xfs_iunlock(ip, *lock_mode);
> +	*lock_mode = XFS_IOLOCK_EXCL;
> +	ret = xfs_ilock_iocb(iocb, *lock_mode);
> +	if (ret)
> +		return ret;
> +
> +	return 0;
> +}
> +
> +static unsigned int
> +xfs_ilock_for_write_fault(
> +	struct xfs_inode	*ip)
> +{
> +	/* get a shared lock if no remapping in progress */
> +	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
> +	if (!xfs_iflags_test(ip, XFS_IREMAPPING))
> +		return XFS_MMAPLOCK_SHARED;
> +
> +	/* wait for remapping to complete */
> +	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
> +	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
> +	return XFS_MMAPLOCK_EXCL;
> +}
> +
>  STATIC ssize_t
>  xfs_file_dio_read(
>  	struct kiocb		*iocb,
> @@ -551,7 +592,7 @@ xfs_file_dio_write_aligned(
>  	unsigned int		iolock = XFS_IOLOCK_SHARED;
>  	ssize_t			ret;
>  
> -	ret = xfs_ilock_iocb(iocb, iolock);
> +	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
>  	if (ret)
>  		return ret;
>  	ret = xfs_file_write_checks(iocb, from, &iolock);
> @@ -618,7 +659,7 @@ xfs_file_dio_write_unaligned(
>  		flags = IOMAP_DIO_FORCE_WAIT;
>  	}
>  
> -	ret = xfs_ilock_iocb(iocb, iolock);
> +	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
>  	if (ret)
>  		return ret;
>  
> @@ -1180,7 +1221,7 @@ xfs_file_remap_range(
>  	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
>  		xfs_log_force_inode(dest);
>  out_unlock:
> -	xfs_iunlock2_io_mmap(src, dest);
> +	xfs_iunlock2_remapping(src, dest);
>  	if (ret)
>  		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
>  	return remapped > 0 ? remapped : ret;
> @@ -1328,6 +1369,7 @@ __xfs_filemap_fault(
>  	struct inode		*inode = file_inode(vmf->vma->vm_file);
>  	struct xfs_inode	*ip = XFS_I(inode);
>  	vm_fault_t		ret;
> +	unsigned int		lock_mode = 0;
>  
>  	trace_xfs_filemap_fault(ip, order, write_fault);
>  
> @@ -1336,25 +1378,24 @@ __xfs_filemap_fault(
>  		file_update_time(vmf->vma->vm_file);
>  	}
>  
> +	if (IS_DAX(inode) || write_fault)
> +		lock_mode = xfs_ilock_for_write_fault(XFS_I(inode));
> +
>  	if (IS_DAX(inode)) {
>  		pfn_t pfn;
>  
> -		xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
>  		ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
>  		if (ret & VM_FAULT_NEEDDSYNC)
>  			ret = dax_finish_sync_fault(vmf, order, pfn);
> -		xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> +	} else if (write_fault) {
> +		ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops);
>  	} else {
> -		if (write_fault) {
> -			xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> -			ret = iomap_page_mkwrite(vmf,
> -					&xfs_page_mkwrite_iomap_ops);
> -			xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> -		} else {
> -			ret = filemap_fault(vmf);
> -		}
> +		ret = filemap_fault(vmf);
>  	}
>  
> +	if (lock_mode)
> +		xfs_iunlock(XFS_I(inode), lock_mode);
> +
>  	if (write_fault)
>  		sb_end_pagefault(inode->i_sb);
>  	return ret;
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 4d55f58d99b7..97b0078249fd 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -3621,6 +3621,23 @@ xfs_iunlock2_io_mmap(
>  		inode_unlock(VFS_I(ip1));
>  }
>  
> +/* Drop the MMAPLOCK and the IOLOCK after a remap completes. */
> +void
> +xfs_iunlock2_remapping(
> +	struct xfs_inode	*ip1,
> +	struct xfs_inode	*ip2)
> +{
> +	xfs_iflags_clear(ip1, XFS_IREMAPPING);
> +
> +	if (ip1 != ip2)
> +		xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED);
> +	xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
> +
> +	if (ip1 != ip2)
> +		inode_unlock_shared(VFS_I(ip1));
> +	inode_unlock(VFS_I(ip2));
> +}
> +
>  /*
>   * Reload the incore inode list for this inode.  Caller should ensure that
>   * the link count cannot change, either by taking ILOCK_SHARED or otherwise
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 0c5bdb91152e..3dc47937da5d 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -347,6 +347,14 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
>  /* Quotacheck is running but inode has not been added to quota counts. */
>  #define XFS_IQUOTAUNCHECKED	(1 << 14)
>  
> +/*
> + * Remap in progress. Callers that wish to update file data while
> + * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake
> + * the lock in exclusive mode. Relocking the file will block until
> + * IREMAPPING is cleared.
> + */
> +#define XFS_IREMAPPING		(1U << 15)
> +
>  /* All inode state flags related to inode reclaim. */
>  #define XFS_ALL_IRECLAIM_FLAGS	(XFS_IRECLAIMABLE | \
>  				 XFS_IRECLAIM | \
> @@ -595,6 +603,7 @@ void xfs_end_io(struct work_struct *work);
>  
>  int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
>  void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
> +void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2);
>  
>  static inline bool
>  xfs_inode_unlinked_incomplete(
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index eb9102453aff..658edee8381d 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -1540,6 +1540,10 @@ xfs_reflink_remap_prep(
>  	if (ret)
>  		goto out_unlock;
>  
> +	xfs_iflags_set(src, XFS_IREMAPPING);
> +	if (inode_in != inode_out)
> +		xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
> +
>  	return 0;
>  out_unlock:
>  	xfs_iunlock2_io_mmap(src, dest);
> -- 
> 2.34.1
>
Dave Chinner Oct. 17, 2023, 11:59 p.m. UTC | #2
On Tue, Oct 17, 2023 at 01:12:08PM -0700, Catherine Hoang wrote:
> One of our VM cluster management products needs to snapshot KVM image
> files so that they can be restored in case of failure. Snapshotting is
> done by redirecting VM disk writes to a sidecar file and using reflink
> on the disk image, specifically the FICLONE ioctl as used by
> "cp --reflink". Reflink locks the source and destination files while it
> operates, which means that reads from the main vm disk image are blocked,
> causing the vm to stall. When an image file is heavily fragmented, the
> copy process could take several minutes. Some of the vm image files have
> 50-100 million extent records, and duplicating that much metadata locks
> the file for 30 minutes or more. Having activities suspended for such
> a long time in a cluster node could result in node eviction.
> 
> Clone operations and read IO do not change any data in the source file,
> so they should be able to run concurrently. Demote the exclusive locks
> taken by FICLONE to shared locks to allow reads while cloning. While a
> clone is in progress, writes will take the IOLOCK_EXCL, so they block
> until the clone completes.
> 
> Link: https://lore.kernel.org/linux-xfs/8911B94D-DD29-4D6E-B5BC-32EAF1866245@oracle.com/
> Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
> ---
>  fs/xfs/xfs_file.c    | 67 +++++++++++++++++++++++++++++++++++---------
>  fs/xfs/xfs_inode.c   | 17 +++++++++++
>  fs/xfs/xfs_inode.h   |  9 ++++++
>  fs/xfs/xfs_reflink.c |  4 +++
>  4 files changed, 84 insertions(+), 13 deletions(-)

All looks OK - one minor nit below.

Reviewed-by: Dave Chinner <dchinner@redhat.com>

> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 203700278ddb..3b9500e18f90 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -214,6 +214,47 @@ xfs_ilock_iocb(
>  	return 0;
>  }
>  
> +static int
> +xfs_ilock_iocb_for_write(
> +	struct kiocb		*iocb,
> +	unsigned int		*lock_mode)
> +{
> +	ssize_t			ret;
> +	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
> +
> +	ret = xfs_ilock_iocb(iocb, *lock_mode);
> +	if (ret)
> +		return ret;
> +
> +	if (*lock_mode == XFS_IOLOCK_EXCL)
> +		return 0;
> +	if (!xfs_iflags_test(ip, XFS_IREMAPPING))
> +		return 0;
> +
> +	xfs_iunlock(ip, *lock_mode);
> +	*lock_mode = XFS_IOLOCK_EXCL;
> +	ret = xfs_ilock_iocb(iocb, *lock_mode);
> +	if (ret)
> +		return ret;
> +
> +	return 0;

This last bit could simply be:

	xfs_iunlock(ip, *lock_mode);
	*lock_mode = XFS_IOLOCK_EXCL;
	return xfs_ilock_iocb(iocb, *lock_mode);

Cheers,

Dave.
Christoph Hellwig Oct. 18, 2023, 6:08 a.m. UTC | #3
On Tue, Oct 17, 2023 at 01:12:08PM -0700, Catherine Hoang wrote:
> One of our VM cluster management products needs to snapshot KVM image
> files so that they can be restored in case of failure. Snapshotting is
> done by redirecting VM disk writes to a sidecar file and using reflink
> on the disk image, specifically the FICLONE ioctl as used by
> "cp --reflink". Reflink locks the source and destination files while it
> operates, which means that reads from the main vm disk image are blocked,
> causing the vm to stall. When an image file is heavily fragmented, the
> copy process could take several minutes. Some of the vm image files have
> 50-100 million extent records, and duplicating that much metadata locks
> the file for 30 minutes or more. Having activities suspended for such
> a long time in a cluster node could result in node eviction.
> 
> Clone operations and read IO do not change any data in the source file,
> so they should be able to run concurrently. Demote the exclusive locks
> taken by FICLONE to shared locks to allow reads while cloning. While a
> clone is in progress, writes will take the IOLOCK_EXCL, so they block
> until the clone completes.

Sorry for being pesky, but do you have some rough numbers on how
much this actually helps with the above workload?

Otherwise looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>
Darrick J. Wong Oct. 19, 2023, 8:04 p.m. UTC | #4
On Tue, Oct 17, 2023 at 11:08:14PM -0700, Christoph Hellwig wrote:
> On Tue, Oct 17, 2023 at 01:12:08PM -0700, Catherine Hoang wrote:
> > One of our VM cluster management products needs to snapshot KVM image
> > files so that they can be restored in case of failure. Snapshotting is
> > done by redirecting VM disk writes to a sidecar file and using reflink
> > on the disk image, specifically the FICLONE ioctl as used by
> > "cp --reflink". Reflink locks the source and destination files while it
> > operates, which means that reads from the main vm disk image are blocked,
> > causing the vm to stall. When an image file is heavily fragmented, the
> > copy process could take several minutes. Some of the vm image files have
> > 50-100 million extent records, and duplicating that much metadata locks
> > the file for 30 minutes or more. Having activities suspended for such
> > a long time in a cluster node could result in node eviction.
> > 
> > Clone operations and read IO do not change any data in the source file,
> > so they should be able to run concurrently. Demote the exclusive locks
> > taken by FICLONE to shared locks to allow reads while cloning. While a
> > clone is in progress, writes will take the IOLOCK_EXCL, so they block
> > until the clone completes.
> 
> Sorry for being pesky, but do you have some rough numbers on how
> much this actually helps with the above workload?

Well... the stupid answer is that I augmented generic/176 to try to race
buffered and direct reads with cloning a million extents and print out
when the racing reads completed.  On an unpatched kernel, the reads
don't complete until the reflink does:

--- /tmp/fstests/tests/generic/176.out  2023-07-11 12:18:21.617971250 -0700
+++ /var/tmp/fstests/generic/176.out.bad        2023-10-19 10:22:04.771017812 -0700
@@ -2,3 +2,8 @@
 Format and mount
 Create a many-block file
 Reflink the big file
+start reflink Thu Oct 19 10:19:19 PDT 2023
+end reflink Thu Oct 19 10:20:06 PDT 2023
+buffered read ioend Thu Oct 19 10:20:06 PDT 2023
+direct read ioend Thu Oct 19 10:20:06 PDT 2023
+finished waiting Thu Oct 19 10:20:06 PDT 2023

Yowza, a minute's worth of read latency!  On a patched kernel, the reads
complete while the clone is running:

--- /tmp/fstests/tests/generic/176.out  2023-07-11 12:18:21.617971250 -0700
+++ /var/tmp/fstests/generic/176.out.bad        2023-10-19 10:22:25.528685643 -0700
@@ -2,3 +2,552 @@
 Format and mount
 Create a many-block file
 Reflink the big file
+start reflink Thu Oct 19 10:19:24 PDT 2023
+buffered read ioend Thu Oct 19 10:19:24 PDT 2023
+direct read ioend Thu Oct 19 10:19:24 PDT 2023
+buffered read ioend Thu Oct 19 10:19:24 PDT 2023
+direct read ioend Thu Oct 19 10:19:24 PDT 2023
+buffered read ioend Thu Oct 19 10:19:24 PDT 2023
+buffered read ioend Thu Oct 19 10:19:24 PDT 2023
+buffered read ioend Thu Oct 19 10:19:25 PDT 2023
+buffered read ioend Thu Oct 19 10:19:25 PDT 2023
+direct read ioend Thu Oct 19 10:19:25 PDT 2023
...
+buffered read ioend Thu Oct 19 10:20:06 PDT 2023
+buffered read ioend Thu Oct 19 10:20:07 PDT 2023
+buffered read ioend Thu Oct 19 10:20:07 PDT 2023
+direct read ioend Thu Oct 19 10:20:07 PDT 2023
+buffered read ioend Thu Oct 19 10:20:07 PDT 2023
+buffered read ioend Thu Oct 19 10:20:07 PDT 2023
+buffered read ioend Thu Oct 19 10:20:07 PDT 2023
+end reflink Thu Oct 19 10:20:07 PDT 2023
+direct read ioend Thu Oct 19 10:20:07 PDT 2023
+finished waiting Thu Oct 19 10:20:07 PDT 2023

So as you can see, reads from the reflink source file no longer
experience a giant latency spike.  I also wrote an fstest to check this
behavior; I'll attach it as a separate reply.
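
(Not the actual fstest -- just a rough standalone sketch of the racing idea,
with placeholder file names: one process repeatedly times reads of the clone
source while another issues FICLONE.)

	/* race.c: rough sketch of the race above; "bigfile" is a placeholder */
	#include <sys/ioctl.h>
	#include <sys/wait.h>
	#include <linux/fs.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <time.h>
	#include <unistd.h>

	int main(void)
	{
		int src = open("bigfile", O_RDONLY);
		int dst = open("bigfile.clone", O_WRONLY | O_CREAT | O_TRUNC, 0644);

		if (src < 0 || dst < 0) {
			perror("open");
			return 1;
		}
		if (fork() == 0) {
			/* reader: time reads of the source while the clone runs */
			char buf[4096];
			int i;

			for (i = 0; i < 10; i++) {
				struct timespec t0, t1;

				clock_gettime(CLOCK_MONOTONIC, &t0);
				pread(src, buf, sizeof(buf), (off_t)i * 4096);
				clock_gettime(CLOCK_MONOTONIC, &t1);
				printf("read %d latency: %lds\n", i,
				       (long)(t1.tv_sec - t0.tv_sec));
			}
			_exit(0);
		}
		if (ioctl(dst, FICLONE, src) < 0)	/* clone the whole file */
			perror("FICLONE");
		wait(NULL);
		return 0;
	}

On an unpatched kernel those read latencies track the clone duration, as in the
output above; on a patched kernel they stay small.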

> Otherwise looks good:
> 
> Reviewed-by: Christoph Hellwig <hch@lst.de>

Thanks!

--D
Christoph Hellwig Oct. 20, 2023, 6:06 a.m. UTC | #5
On Thu, Oct 19, 2023 at 01:04:11PM -0700, Darrick J. Wong wrote:
> Well... the stupid answer is that I augmented generic/176 to try to race
> buffered and direct reads with cloning a million extents and print out
> when the racing reads completed.  On an unpatched kernel, the reads
> don't complete until the reflink does:

> So as you can see, reads from the reflink source file no longer
> experience a giant latency spike.  I also wrote an fstest to check this
> behavior; I'll attach it as a separate reply.

Nice.  I guess write latency doesn't really matter for this use
case?
Darrick J. Wong Oct. 20, 2023, 3:34 p.m. UTC | #6
On Thu, Oct 19, 2023 at 11:06:42PM -0700, Christoph Hellwig wrote:
> On Thu, Oct 19, 2023 at 01:04:11PM -0700, Darrick J. Wong wrote:
> > Well... the stupid answer is that I augmented generic/176 to try to race
> > buffered and direct reads with cloning a million extents and print out
> > when the racing reads completed.  On an unpatched kernel, the reads
> > don't complete until the reflink does:
> 
> > So as you can see, reads from the reflink source file no longer
> > experience a giant latency spike.  I also wrote an fstest to check this
> > behavior; I'll attach it as a separate reply.
> 
> Nice.  I guess write latency doesn't really matter for this use
> case?

Nope -- they've gotten libvirt to tell qemu to redirect vm disk writes
to a new sidecar file.  Then they reflink the original source file to
the backup file, but they want qemu to be able to service reads from
that original source file while the reflink is ongoing.  When the backup
is done, they commit the sidecar contents back into the original image.

It would be kinda neat if we had file range locks.  Regular progress
could shorten the range as it makes progress.  If the thread doing the
reflink could find out that another thread has blocked on part of the
file range, it could even hurry up and clone that part so that neither
reads nor writes would see enormous latency spikes.

Even better, we could actually support concurrent reads and writes to
the page cache as long as the ranges don't overlap.  But that's all
speculative until Dave dumps his old ranged lock patchset on the list.

--D
Dave Chinner Oct. 22, 2023, 10:42 p.m. UTC | #7
On Fri, Oct 20, 2023 at 08:34:48AM -0700, Darrick J. Wong wrote:
> On Thu, Oct 19, 2023 at 11:06:42PM -0700, Christoph Hellwig wrote:
> > On Thu, Oct 19, 2023 at 01:04:11PM -0700, Darrick J. Wong wrote:
> > > Well... the stupid answer is that I augmented generic/176 to try to race
> > > buffered and direct reads with cloning a million extents and print out
> > > when the racing reads completed.  On an unpatched kernel, the reads
> > > don't complete until the reflink does:
> > 
> > > So as you can see, reads from the reflink source file no longer
> > > experience a giant latency spike.  I also wrote an fstest to check this
> > > behavior; I'll attach it as a separate reply.
> > 
> > Nice.  I guess write latency doesn't really matter for this use
> > case?
> 
> Nope -- they've gotten libvirt to tell qemu to redirect vm disk writes
> to a new sidecar file.  Then they reflink the original source file to
> the backup file, but they want qemu to be able to service reads from
> that original source file while the reflink is ongoing.  When the backup
> is done, they commit the sidecar contents back into the original image.
> 
> It would be kinda neat if we had file range locks.  Regular progress
> could shorten the range as it makes progress.  If the thread doing the
> reflink could find out that another thread has blocked on part of the
> file range, it could even hurry up and clone that part so that neither
> reads nor writes would see enormous latency spikes.
> 
> Even better, we could actually support concurrent reads and writes to
> the page cache as long as the ranges don't overlap.  But that's all
> speculative until Dave dumps his old ranged lock patchset on the list.

The unfortunate reality is that range locks as I was trying to
implement them didn't scale - it was a failed experiment.

The issue is the internal tracking structure of a range lock. It has
to be concurrency safe itself, and even with lockless tree
structures using per-node seqlocks for internal sequencing, they
still rely on atomic ops for safe concurrent access and updates.

Hence the best I could get out of an uncontended range lock (i.e.
locking different exclusive ranges concurrently) was about 400,000
lock/unlock operations per second before the internal tracking
structure broke down under concurrent modification pressure.  That
was a whole lot better than previous attempts that topped out at
~150,000 lock/unlock ops/s, but it's still far short of the ~3
million concurrent shared lock/unlock ops/s than a rwsem could do on
that same machine.

Worse for range locks was that once past peak performance,
internal contention within the range lock caused performance to fall
off a cliff and end up being much worse than just using pure
exclusive locking with a mutex.

Hence without some novel new internal lockless and memory allocation
free tracking structure and algorithm, range locks will suck for the
one thing we want them for: high performance, highly concurrent
access to discrete ranges of a single file.

-Dave.
Chandan Babu R Oct. 23, 2023, 6:42 a.m. UTC | #8
On Wed, Oct 18, 2023 at 10:59:10 AM +1100, Dave Chinner wrote:
> On Tue, Oct 17, 2023 at 01:12:08PM -0700, Catherine Hoang wrote:
>> One of our VM cluster management products needs to snapshot KVM image
>> files so that they can be restored in case of failure. Snapshotting is
>> done by redirecting VM disk writes to a sidecar file and using reflink
>> on the disk image, specifically the FICLONE ioctl as used by
>> "cp --reflink". Reflink locks the source and destination files while it
>> operates, which means that reads from the main vm disk image are blocked,
>> causing the vm to stall. When an image file is heavily fragmented, the
>> copy process could take several minutes. Some of the vm image files have
>> 50-100 million extent records, and duplicating that much metadata locks
>> the file for 30 minutes or more. Having activities suspended for such
>> a long time in a cluster node could result in node eviction.
>> 
>> Clone operations and read IO do not change any data in the source file,
>> so they should be able to run concurrently. Demote the exclusive locks
>> taken by FICLONE to shared locks to allow reads while cloning. While a
>> clone is in progress, writes will take the IOLOCK_EXCL, so they block
>> until the clone completes.
>> 
>> Link: https://lore.kernel.org/linux-xfs/8911B94D-DD29-4D6E-B5BC-32EAF1866245@oracle.com/
>> Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
>> ---
>>  fs/xfs/xfs_file.c    | 67 +++++++++++++++++++++++++++++++++++---------
>>  fs/xfs/xfs_inode.c   | 17 +++++++++++
>>  fs/xfs/xfs_inode.h   |  9 ++++++
>>  fs/xfs/xfs_reflink.c |  4 +++
>>  4 files changed, 84 insertions(+), 13 deletions(-)
>
> All looks OK - one minor nit below.
>
> Reviewed-by: Dave Chinner <dchinner@redhat.com>
>
>> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
>> index 203700278ddb..3b9500e18f90 100644
>> --- a/fs/xfs/xfs_file.c
>> +++ b/fs/xfs/xfs_file.c
>> @@ -214,6 +214,47 @@ xfs_ilock_iocb(
>>  	return 0;
>>  }
>>  
>> +static int
>> +xfs_ilock_iocb_for_write(
>> +	struct kiocb		*iocb,
>> +	unsigned int		*lock_mode)
>> +{
>> +	ssize_t			ret;
>> +	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
>> +
>> +	ret = xfs_ilock_iocb(iocb, *lock_mode);
>> +	if (ret)
>> +		return ret;
>> +
>> +	if (*lock_mode == XFS_IOLOCK_EXCL)
>> +		return 0;
>> +	if (!xfs_iflags_test(ip, XFS_IREMAPPING))
>> +		return 0;
>> +
>> +	xfs_iunlock(ip, *lock_mode);
>> +	*lock_mode = XFS_IOLOCK_EXCL;
>> +	ret = xfs_ilock_iocb(iocb, *lock_mode);
>> +	if (ret)
>> +		return ret;
>> +
>> +	return 0;
>
> This last bit could simply be:
>
> 	xfs_iunlock(ip, *lock_mode);
> 	*lock_mode = XFS_IOLOCK_EXCL;
> 	return xfs_ilock_iocb(iocb, *lock_mode);
>

Catherine, I have made the modifications suggested above and committed
the patch to my local Git tree.
Darrick J. Wong Oct. 23, 2023, 3:40 p.m. UTC | #9
On Mon, Oct 23, 2023 at 09:42:59AM +1100, Dave Chinner wrote:
> On Fri, Oct 20, 2023 at 08:34:48AM -0700, Darrick J. Wong wrote:
> > On Thu, Oct 19, 2023 at 11:06:42PM -0700, Christoph Hellwig wrote:
> > > On Thu, Oct 19, 2023 at 01:04:11PM -0700, Darrick J. Wong wrote:
> > > > Well... the stupid answer is that I augmented generic/176 to try to race
> > > > buffered and direct reads with cloning a million extents and print out
> > > > when the racing reads completed.  On an unpatched kernel, the reads
> > > > don't complete until the reflink does:
> > > 
> > > > So as you can see, reads from the reflink source file no longer
> > > > experience a giant latency spike.  I also wrote an fstest to check this
> > > > behavior; I'll attach it as a separate reply.
> > > 
> > > Nice.  I guess write latency doesn't really matter for this use
> > > case?
> > 
> > Nope -- they've gotten libvirt to tell qemu to redirect vm disk writes
> > to a new sidecar file.  Then they reflink the original source file to
> > the backup file, but they want qemu to be able to service reads from
> > that original source file while the reflink is ongoing.  When the backup
> > is done, they commit the sidecar contents back into the original image.
> > 
> > It would be kinda neat if we had file range locks.  Regular progress
> > could shorten the range as it makes progress.  If the thread doing the
> > reflink could find out that another thread has blocked on part of the
> > file range, it could even hurry up and clone that part so that neither
> > reads nor writes would see enormous latency spikes.
> > 
> > Even better, we could actually support concurrent reads and writes to
> > the page cache as long as the ranges don't overlap.  But that's all
> > speculative until Dave dumps his old ranged lock patchset on the list.
> 
> The unfortunate reality is that range locks as I was trying to
> implement them didn't scale - it was a failed experiment.
> 
> The issue is the internal tracking structure of a range lock. It has
> to be concurrency safe itself, and even with lockless tree
> structures using per-node seqlocks for internal sequencing, they
> still rely on atomic ops for safe concurrent access and updates.
> 
> Hence the best I could get out of an uncontended range lock (i.e.
> locking different exclusive ranges concurrently) was about 400,000
> lock/unlock operations per second before the internal tracking
> structure broke down under concurrent modification pressure.  That
> was a whole lot better than previous attempts that topped out at
> ~150,000 lock/unlock ops/s, but it's still far short of the ~3
> million concurrent shared lock/unlock ops/s than a rwsem could do on
> that same machine.
> 
> Worse for range locks was that once past peak performance,
> internal contention within the range lock caused performance to fall
> off a cliff and end up being much worse than just using pure
> exclusive locking with a mutex.
> 
> Hence without some novel new internal lockless and memory allocation
> free tracking structure and algorithm, range locks will suck for the
> one thing we want them for: high performance, highly concurrent
> access to discrete ranges of a single file.

Ah.  Thanks for the reminder about that.

--D

> -Dave.
> 
> -- 
> Dave Chinner
> david@fromorbit.com

Patch

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 203700278ddb..3b9500e18f90 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -214,6 +214,47 @@  xfs_ilock_iocb(
 	return 0;
 }
 
+static int
+xfs_ilock_iocb_for_write(
+	struct kiocb		*iocb,
+	unsigned int		*lock_mode)
+{
+	ssize_t			ret;
+	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
+
+	ret = xfs_ilock_iocb(iocb, *lock_mode);
+	if (ret)
+		return ret;
+
+	if (*lock_mode == XFS_IOLOCK_EXCL)
+		return 0;
+	if (!xfs_iflags_test(ip, XFS_IREMAPPING))
+		return 0;
+
+	xfs_iunlock(ip, *lock_mode);
+	*lock_mode = XFS_IOLOCK_EXCL;
+	ret = xfs_ilock_iocb(iocb, *lock_mode);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static unsigned int
+xfs_ilock_for_write_fault(
+	struct xfs_inode	*ip)
+{
+	/* get a shared lock if no remapping in progress */
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	if (!xfs_iflags_test(ip, XFS_IREMAPPING))
+		return XFS_MMAPLOCK_SHARED;
+
+	/* wait for remapping to complete */
+	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+	return XFS_MMAPLOCK_EXCL;
+}
+
 STATIC ssize_t
 xfs_file_dio_read(
 	struct kiocb		*iocb,
@@ -551,7 +592,7 @@  xfs_file_dio_write_aligned(
 	unsigned int		iolock = XFS_IOLOCK_SHARED;
 	ssize_t			ret;
 
-	ret = xfs_ilock_iocb(iocb, iolock);
+	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
 	if (ret)
 		return ret;
 	ret = xfs_file_write_checks(iocb, from, &iolock);
@@ -618,7 +659,7 @@  xfs_file_dio_write_unaligned(
 		flags = IOMAP_DIO_FORCE_WAIT;
 	}
 
-	ret = xfs_ilock_iocb(iocb, iolock);
+	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
 	if (ret)
 		return ret;
 
@@ -1180,7 +1221,7 @@  xfs_file_remap_range(
 	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
 		xfs_log_force_inode(dest);
 out_unlock:
-	xfs_iunlock2_io_mmap(src, dest);
+	xfs_iunlock2_remapping(src, dest);
 	if (ret)
 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
 	return remapped > 0 ? remapped : ret;
@@ -1328,6 +1369,7 @@  __xfs_filemap_fault(
 	struct inode		*inode = file_inode(vmf->vma->vm_file);
 	struct xfs_inode	*ip = XFS_I(inode);
 	vm_fault_t		ret;
+	unsigned int		lock_mode = 0;
 
 	trace_xfs_filemap_fault(ip, order, write_fault);
 
@@ -1336,25 +1378,24 @@  __xfs_filemap_fault(
 		file_update_time(vmf->vma->vm_file);
 	}
 
+	if (IS_DAX(inode) || write_fault)
+		lock_mode = xfs_ilock_for_write_fault(XFS_I(inode));
+
 	if (IS_DAX(inode)) {
 		pfn_t pfn;
 
-		xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 		ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
 		if (ret & VM_FAULT_NEEDDSYNC)
 			ret = dax_finish_sync_fault(vmf, order, pfn);
-		xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+	} else if (write_fault) {
+		ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops);
 	} else {
-		if (write_fault) {
-			xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-			ret = iomap_page_mkwrite(vmf,
-					&xfs_page_mkwrite_iomap_ops);
-			xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-		} else {
-			ret = filemap_fault(vmf);
-		}
+		ret = filemap_fault(vmf);
 	}
 
+	if (lock_mode)
+		xfs_iunlock(XFS_I(inode), lock_mode);
+
 	if (write_fault)
 		sb_end_pagefault(inode->i_sb);
 	return ret;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 4d55f58d99b7..97b0078249fd 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3621,6 +3621,23 @@  xfs_iunlock2_io_mmap(
 		inode_unlock(VFS_I(ip1));
 }
 
+/* Drop the MMAPLOCK and the IOLOCK after a remap completes. */
+void
+xfs_iunlock2_remapping(
+	struct xfs_inode	*ip1,
+	struct xfs_inode	*ip2)
+{
+	xfs_iflags_clear(ip1, XFS_IREMAPPING);
+
+	if (ip1 != ip2)
+		xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED);
+	xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+
+	if (ip1 != ip2)
+		inode_unlock_shared(VFS_I(ip1));
+	inode_unlock(VFS_I(ip2));
+}
+
 /*
  * Reload the incore inode list for this inode.  Caller should ensure that
  * the link count cannot change, either by taking ILOCK_SHARED or otherwise
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0c5bdb91152e..3dc47937da5d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -347,6 +347,14 @@  static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
 /* Quotacheck is running but inode has not been added to quota counts. */
 #define XFS_IQUOTAUNCHECKED	(1 << 14)
 
+/*
+ * Remap in progress. Callers that wish to update file data while
+ * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake
+ * the lock in exclusive mode. Relocking the file will block until
+ * IREMAPPING is cleared.
+ */
+#define XFS_IREMAPPING		(1U << 15)
+
 /* All inode state flags related to inode reclaim. */
 #define XFS_ALL_IRECLAIM_FLAGS	(XFS_IRECLAIMABLE | \
 				 XFS_IRECLAIM | \
@@ -595,6 +603,7 @@  void xfs_end_io(struct work_struct *work);
 
 int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
 void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2);
 
 static inline bool
 xfs_inode_unlinked_incomplete(
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index eb9102453aff..658edee8381d 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1540,6 +1540,10 @@  xfs_reflink_remap_prep(
 	if (ret)
 		goto out_unlock;
 
+	xfs_iflags_set(src, XFS_IREMAPPING);
+	if (inode_in != inode_out)
+		xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
+
 	return 0;
 out_unlock:
 	xfs_iunlock2_io_mmap(src, dest);