diff mbox series

[v2] xfs: allow read IO and FICLONE to run concurrently

Message ID 20231009205936.38644-1-catherine.hoang@oracle.com (mailing list archive)
State Superseded
Headers show
Series [v2] xfs: allow read IO and FICLONE to run concurrently | expand

Commit Message

Catherine Hoang Oct. 9, 2023, 8:59 p.m. UTC
Clone operations and read IO do not change any data in the source file, so they
should be able to run concurrently. Demote the exclusive locks taken by FICLONE
to shared locks to allow reads while cloning. While a clone is in progress,
writes will take the IOLOCK_EXCL, so they block until the clone completes.

Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
---
 fs/xfs/xfs_file.c    | 41 +++++++++++++++++++++++++++++++++++------
 fs/xfs/xfs_inode.h   |  8 ++++++++
 fs/xfs/xfs_reflink.c | 19 +++++++++++++++++++
 fs/xfs/xfs_reflink.h |  2 ++
 4 files changed, 64 insertions(+), 6 deletions(-)

Comments

Darrick J. Wong Oct. 9, 2023, 9:17 p.m. UTC | #1
On Mon, Oct 09, 2023 at 01:59:36PM -0700, Catherine Hoang wrote:
> Clone operations and read IO do not change any data in the source file, so they
> should be able to run concurrently. Demote the exclusive locks taken by FICLONE
> to shared locks to allow reads while cloning. While a clone is in progress,
> writes will take the IOLOCK_EXCL, so they block until the clone completes.
> 
> Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
> ---
>  fs/xfs/xfs_file.c    | 41 +++++++++++++++++++++++++++++++++++------
>  fs/xfs/xfs_inode.h   |  8 ++++++++
>  fs/xfs/xfs_reflink.c | 19 +++++++++++++++++++
>  fs/xfs/xfs_reflink.h |  2 ++
>  4 files changed, 64 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 203700278ddb..5bfb2013366f 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -554,6 +554,15 @@ xfs_file_dio_write_aligned(
>  	ret = xfs_ilock_iocb(iocb, iolock);
>  	if (ret)
>  		return ret;
> +
> +	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
> +		xfs_iunlock(ip, iolock);
> +		iolock = XFS_IOLOCK_EXCL;
> +		ret = xfs_ilock_iocb(iocb, iolock);
> +		if (ret)
> +			return ret;
> +	}
> +
>  	ret = xfs_file_write_checks(iocb, from, &iolock);
>  	if (ret)
>  		goto out_unlock;
> @@ -563,7 +572,7 @@ xfs_file_dio_write_aligned(
>  	 * the iolock back to shared if we had to take the exclusive lock in
>  	 * xfs_file_write_checks() for other reasons.
>  	 */
> -	if (iolock == XFS_IOLOCK_EXCL) {
> +	if (iolock == XFS_IOLOCK_EXCL && !xfs_iflags_test(ip, XFS_IREMAPPING)) {
>  		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
>  		iolock = XFS_IOLOCK_SHARED;

If we hold IOLOCK_EXCL on an aligned write and we think we can demote
the lock to IOLOCK_SHARED, why do we need to test IREMAPPING?

The direct writer holds IOLOCK_EXCL, so there cannot be any remapping
operations in progress on this file at all.

OTOH if there are remapping operations holding the IOLOCK in any mode,
then there cannot be any direct writers holding IOLOCK_EXCL.

There can't be any xfs_file_dio_write_aligned threads holding
IOLOCK_SHARED racing with reflink because either the direct writer sees
IREMAPPING and has to drop IOLOCK_SHARED to get IOLOCK_EXCL; or the
direct write is proceeding while holding IOLOCK_SHARED, which prevents
reflink from starting up.

So, uh, what is the IREMAPPING test here protecting against?

(Did I miss some other conflict between reflink and the dio count?
That's entirely possible...)

>  	}
> @@ -622,6 +631,14 @@ xfs_file_dio_write_unaligned(
>  	if (ret)
>  		return ret;
>  
> +	if (iolock != XFS_IOLOCK_EXCL && xfs_iflags_test(ip, XFS_IREMAPPING)) {
> +		xfs_iunlock(ip, iolock);
> +		iolock = XFS_IOLOCK_EXCL;
> +		ret = xfs_ilock_iocb(iocb, iolock);
> +		if (ret)
> +			return ret;
> +	}
> +
>  	/*
>  	 * We can't properly handle unaligned direct I/O to reflink files yet,
>  	 * as we can't unshare a partial block.
> @@ -1180,7 +1197,8 @@ xfs_file_remap_range(
>  	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
>  		xfs_log_force_inode(dest);
>  out_unlock:
> -	xfs_iunlock2_io_mmap(src, dest);
> +	xfs_iflags_clear(src, XFS_IREMAPPING);
> +	xfs_reflink_unlock(src, dest);
>  	if (ret)
>  		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
>  	return remapped > 0 ? remapped : ret;
> @@ -1328,6 +1346,7 @@ __xfs_filemap_fault(
>  	struct inode		*inode = file_inode(vmf->vma->vm_file);
>  	struct xfs_inode	*ip = XFS_I(inode);
>  	vm_fault_t		ret;
> +	uint                    mmaplock = XFS_MMAPLOCK_SHARED;
>  
>  	trace_xfs_filemap_fault(ip, order, write_fault);
>  
> @@ -1339,17 +1358,27 @@ __xfs_filemap_fault(
>  	if (IS_DAX(inode)) {
>  		pfn_t pfn;
>  
> -		xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> +		xfs_ilock(XFS_I(inode), mmaplock);
> +		if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
> +			xfs_iunlock(ip, mmaplock);
> +			mmaplock = XFS_MMAPLOCK_EXCL;
> +			xfs_ilock(XFS_I(inode), mmaplock);
> +		}
>  		ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
>  		if (ret & VM_FAULT_NEEDDSYNC)
>  			ret = dax_finish_sync_fault(vmf, order, pfn);
> -		xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> +		xfs_iunlock(XFS_I(inode), mmaplock);
>  	} else {
>  		if (write_fault) {
> -			xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> +			xfs_ilock(XFS_I(inode), mmaplock);
> +			if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
> +				xfs_iunlock(ip, mmaplock);
> +				mmaplock = XFS_MMAPLOCK_EXCL;
> +				xfs_ilock(XFS_I(inode), mmaplock);
> +			}
>  			ret = iomap_page_mkwrite(vmf,
>  					&xfs_page_mkwrite_iomap_ops);
> -			xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> +			xfs_iunlock(XFS_I(inode), mmaplock);
>  		} else {
>  			ret = filemap_fault(vmf);
>  		}
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 0c5bdb91152e..3046ddfa2358 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -347,6 +347,14 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
>  /* Quotacheck is running but inode has not been added to quota counts. */
>  #define XFS_IQUOTAUNCHECKED	(1 << 14)
>  
> +/*
> + * Remap in progress. Callers that wish to update file data while
> + * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake
> + * the lock in exclusive mode. Relocking the file will block until
> + * IREMAPPING is cleared.
> + */
> +#define XFS_IREMAPPING		(1U << 15)
> +
>  /* All inode state flags related to inode reclaim. */
>  #define XFS_ALL_IRECLAIM_FLAGS	(XFS_IRECLAIMABLE | \
>  				 XFS_IRECLAIM | \
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index eb9102453aff..26cbf99061b0 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -1540,6 +1540,10 @@ xfs_reflink_remap_prep(
>  	if (ret)
>  		goto out_unlock;
>  
> +	xfs_iflags_set(src, XFS_IREMAPPING);
> +	if (inode_in != inode_out)
> +		xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
> +
>  	return 0;
>  out_unlock:
>  	xfs_iunlock2_io_mmap(src, dest);
> @@ -1718,3 +1722,18 @@ xfs_reflink_unshare(
>  	trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
>  	return error;
>  }
> +
> +/* Unlock both inodes after the reflink completes. */

This comment should be more specific about what it unlocks:

/* Drop the MMAPLOCK and the IOLOCK after a remap completes. */

--D

> +void
> +xfs_reflink_unlock(
> +	struct xfs_inode	*ip1,
> +	struct xfs_inode	*ip2)
> +{
> +	if (ip1 != ip2)
> +		xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED);
> +	xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
> +
> +	if (ip1 != ip2)
> +		inode_unlock_shared(VFS_I(ip1));
> +	inode_unlock(VFS_I(ip2));
> +}
> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> index 65c5dfe17ecf..89f4d2a2f52e 100644
> --- a/fs/xfs/xfs_reflink.h
> +++ b/fs/xfs/xfs_reflink.h
> @@ -53,4 +53,6 @@ extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in,
>  extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
>  		xfs_extlen_t cowextsize, unsigned int remap_flags);
>  
> +void xfs_reflink_unlock(struct xfs_inode *ip1, struct xfs_inode *ip2);
> +
>  #endif /* __XFS_REFLINK_H */
> -- 
> 2.34.1
>
Dave Chinner Oct. 9, 2023, 9:39 p.m. UTC | #2
On Mon, Oct 09, 2023 at 01:59:36PM -0700, Catherine Hoang wrote:
> Clone operations and read IO do not change any data in the source file, so they
> should be able to run concurrently. Demote the exclusive locks taken by FICLONE
> to shared locks to allow reads while cloning. While a clone is in progress,
> writes will take the IOLOCK_EXCL, so they block until the clone completes.
> 
> Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
> ---
>  fs/xfs/xfs_file.c    | 41 +++++++++++++++++++++++++++++++++++------
>  fs/xfs/xfs_inode.h   |  8 ++++++++
>  fs/xfs/xfs_reflink.c | 19 +++++++++++++++++++
>  fs/xfs/xfs_reflink.h |  2 ++
>  4 files changed, 64 insertions(+), 6 deletions(-)

This has turned out pretty nice and simple :)

I have a couple of suggestions to improve it further, though.

> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 203700278ddb..5bfb2013366f 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -554,6 +554,15 @@ xfs_file_dio_write_aligned(
>  	ret = xfs_ilock_iocb(iocb, iolock);
>  	if (ret)
>  		return ret;
> +
> +	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
> +		xfs_iunlock(ip, iolock);
> +		iolock = XFS_IOLOCK_EXCL;
> +		ret = xfs_ilock_iocb(iocb, iolock);
> +		if (ret)
> +			return ret;
> +	}
> +
>  	ret = xfs_file_write_checks(iocb, from, &iolock);
>  	if (ret)
>  		goto out_unlock;
> @@ -563,7 +572,7 @@ xfs_file_dio_write_aligned(
>  	 * the iolock back to shared if we had to take the exclusive lock in
>  	 * xfs_file_write_checks() for other reasons.
>  	 */
> -	if (iolock == XFS_IOLOCK_EXCL) {
> +	if (iolock == XFS_IOLOCK_EXCL && !xfs_iflags_test(ip, XFS_IREMAPPING)) {
>  		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
>  		iolock = XFS_IOLOCK_SHARED;
>  	}

This is unnecessary. If we already have the IOLOCK_EXCL, then a
remapping cannot be in progress and hence that flag should never be
set here.

> @@ -622,6 +631,14 @@ xfs_file_dio_write_unaligned(
>  	if (ret)
>  		return ret;
>  
> +	if (iolock != XFS_IOLOCK_EXCL && xfs_iflags_test(ip, XFS_IREMAPPING)) {
> +		xfs_iunlock(ip, iolock);
> +		iolock = XFS_IOLOCK_EXCL;
> +		ret = xfs_ilock_iocb(iocb, iolock);
> +		if (ret)
> +			return ret;
> +	}

Rather than duplicating this, I'm wondering if we should just put
this in a wrapper - xfs_ilock_iocb_for_write(), which does:

static int
xfs_ilock_iocb_for_write(
	struct kiocb	*iocb,
	int		*lock_mode)
{
	struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));

	do {
		ret = xfs_ilock_iocb(iocb, *lock_mode);
		if (ret)
			return ret;
		if (*lock_mode == XFS_IOLOCK_EXCL)
			return 0;
		if (!xfs_iflags_test(ip, XFS_IREMAPPING))
			return 0;
		xfs_iunlock(ip, *lock_mode);
		*lock_mode = XFS_IOLOCK_EXCL;
	} while (1);
	/* notreached */
	return -EAGAIN;
}

And then we can call it from the buffered and DAX write paths, too,
so that all the write IO paths use the same locking and will handle
this case automatically if they are converted to use shared locking.

> +
>  	/*
>  	 * We can't properly handle unaligned direct I/O to reflink files yet,
>  	 * as we can't unshare a partial block.
> @@ -1180,7 +1197,8 @@ xfs_file_remap_range(
>  	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
>  		xfs_log_force_inode(dest);
>  out_unlock:
> -	xfs_iunlock2_io_mmap(src, dest);
> +	xfs_iflags_clear(src, XFS_IREMAPPING);
> +	xfs_reflink_unlock(src, dest);

Make that xfs_iunlock2_remapping(), and move the clearing of the
remapping bit inside it.


>  	if (ret)
>  		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
>  	return remapped > 0 ? remapped : ret;
> @@ -1328,6 +1346,7 @@ __xfs_filemap_fault(
>  	struct inode		*inode = file_inode(vmf->vma->vm_file);
>  	struct xfs_inode	*ip = XFS_I(inode);
>  	vm_fault_t		ret;
> +	uint                    mmaplock = XFS_MMAPLOCK_SHARED;
>  
>  	trace_xfs_filemap_fault(ip, order, write_fault);
>  
> @@ -1339,17 +1358,27 @@ __xfs_filemap_fault(
>  	if (IS_DAX(inode)) {
>  		pfn_t pfn;
>  
> -		xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> +		xfs_ilock(XFS_I(inode), mmaplock);
> +		if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
> +			xfs_iunlock(ip, mmaplock);
> +			mmaplock = XFS_MMAPLOCK_EXCL;
> +			xfs_ilock(XFS_I(inode), mmaplock);
> +		}
>  		ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
>  		if (ret & VM_FAULT_NEEDDSYNC)
>  			ret = dax_finish_sync_fault(vmf, order, pfn);
> -		xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> +		xfs_iunlock(XFS_I(inode), mmaplock);
>  	} else {
>  		if (write_fault) {
> -			xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> +			xfs_ilock(XFS_I(inode), mmaplock);
> +			if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
> +				xfs_iunlock(ip, mmaplock);
> +				mmaplock = XFS_MMAPLOCK_EXCL;
> +				xfs_ilock(XFS_I(inode), mmaplock);
> +			}
>  			ret = iomap_page_mkwrite(vmf,
>  					&xfs_page_mkwrite_iomap_ops);
> -			xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> +			xfs_iunlock(XFS_I(inode), mmaplock);

This would be better done as:

	lock_mode = 0;
	if (IS_DAX(inode) || write_fault)
		lock_mode = xfs_ilock_for_write_fault(XFS_I(inode));

	if (IS_DAX(inode)) {
		pfn_t pfn;

		ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
		if (ret & VM_FAULT_NEEDDSYNC)
			ret = dax_finish_sync_fault(vmf, order, pfn);
	} else if (write_fault) {
		ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops);
	} else {
		ret = filemap_fault(vmf);
	}

	if (lock_mode)
		xfs_iunlock(XFS_I(inode), lock_mode);
....


and

static int
xfs_ilock_for_write_fault(
	struct xfs_inode	*ip)
{
	int			lock_mode = XFS_MMAPLOCK_SHARED;

	do {
		xfs_ilock(ip, lock_mode);
		if (!xfs_iflags_test(ip, XFS_IREMAPPING))
			return lock_mode;
		if (lock_mode == XFS_MMAPLOCK_EXCL)
			return lock_mode;
		xfs_iunlock(ip, lock_mode);
		lock_mode = XFS_MMAPLOCK_SHARED;
	} while (1);
	/* notreached */
	return 0;
}

This separates out the locking from the processing of the fault
quite nicely....

> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 0c5bdb91152e..3046ddfa2358 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -347,6 +347,14 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
>  /* Quotacheck is running but inode has not been added to quota counts. */
>  #define XFS_IQUOTAUNCHECKED	(1 << 14)
>  
> +/*
> + * Remap in progress. Callers that wish to update file data while
> + * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake
> + * the lock in exclusive mode. Relocking the file will block until
> + * IREMAPPING is cleared.
> + */
> +#define XFS_IREMAPPING		(1U << 15)
> +
>  /* All inode state flags related to inode reclaim. */
>  #define XFS_ALL_IRECLAIM_FLAGS	(XFS_IRECLAIMABLE | \
>  				 XFS_IRECLAIM | \
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index eb9102453aff..26cbf99061b0 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -1540,6 +1540,10 @@ xfs_reflink_remap_prep(
>  	if (ret)
>  		goto out_unlock;
>  
> +	xfs_iflags_set(src, XFS_IREMAPPING);
> +	if (inode_in != inode_out)
> +		xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
> +
>  	return 0;
>  out_unlock:
>  	xfs_iunlock2_io_mmap(src, dest);
> @@ -1718,3 +1722,18 @@ xfs_reflink_unshare(
>  	trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
>  	return error;
>  }
> +
> +/* Unlock both inodes after the reflink completes. */
> +void
> +xfs_reflink_unlock(
> +	struct xfs_inode	*ip1,
> +	struct xfs_inode	*ip2)
> +{
> +	if (ip1 != ip2)
> +		xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED);
> +	xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
> +
> +	if (ip1 != ip2)
> +		inode_unlock_shared(VFS_I(ip1));
> +	inode_unlock(VFS_I(ip2));
> +}

That should go with all the other inode locking functions in
xfs_inode.c.

-Dave.
diff mbox series

Patch

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 203700278ddb..5bfb2013366f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -554,6 +554,15 @@  xfs_file_dio_write_aligned(
 	ret = xfs_ilock_iocb(iocb, iolock);
 	if (ret)
 		return ret;
+
+	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
+		xfs_iunlock(ip, iolock);
+		iolock = XFS_IOLOCK_EXCL;
+		ret = xfs_ilock_iocb(iocb, iolock);
+		if (ret)
+			return ret;
+	}
+
 	ret = xfs_file_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out_unlock;
@@ -563,7 +572,7 @@  xfs_file_dio_write_aligned(
 	 * the iolock back to shared if we had to take the exclusive lock in
 	 * xfs_file_write_checks() for other reasons.
 	 */
-	if (iolock == XFS_IOLOCK_EXCL) {
+	if (iolock == XFS_IOLOCK_EXCL && !xfs_iflags_test(ip, XFS_IREMAPPING)) {
 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
 		iolock = XFS_IOLOCK_SHARED;
 	}
@@ -622,6 +631,14 @@  xfs_file_dio_write_unaligned(
 	if (ret)
 		return ret;
 
+	if (iolock != XFS_IOLOCK_EXCL && xfs_iflags_test(ip, XFS_IREMAPPING)) {
+		xfs_iunlock(ip, iolock);
+		iolock = XFS_IOLOCK_EXCL;
+		ret = xfs_ilock_iocb(iocb, iolock);
+		if (ret)
+			return ret;
+	}
+
 	/*
 	 * We can't properly handle unaligned direct I/O to reflink files yet,
 	 * as we can't unshare a partial block.
@@ -1180,7 +1197,8 @@  xfs_file_remap_range(
 	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
 		xfs_log_force_inode(dest);
 out_unlock:
-	xfs_iunlock2_io_mmap(src, dest);
+	xfs_iflags_clear(src, XFS_IREMAPPING);
+	xfs_reflink_unlock(src, dest);
 	if (ret)
 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
 	return remapped > 0 ? remapped : ret;
@@ -1328,6 +1346,7 @@  __xfs_filemap_fault(
 	struct inode		*inode = file_inode(vmf->vma->vm_file);
 	struct xfs_inode	*ip = XFS_I(inode);
 	vm_fault_t		ret;
+	uint                    mmaplock = XFS_MMAPLOCK_SHARED;
 
 	trace_xfs_filemap_fault(ip, order, write_fault);
 
@@ -1339,17 +1358,27 @@  __xfs_filemap_fault(
 	if (IS_DAX(inode)) {
 		pfn_t pfn;
 
-		xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+		xfs_ilock(XFS_I(inode), mmaplock);
+		if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
+			xfs_iunlock(ip, mmaplock);
+			mmaplock = XFS_MMAPLOCK_EXCL;
+			xfs_ilock(XFS_I(inode), mmaplock);
+		}
 		ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
 		if (ret & VM_FAULT_NEEDDSYNC)
 			ret = dax_finish_sync_fault(vmf, order, pfn);
-		xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+		xfs_iunlock(XFS_I(inode), mmaplock);
 	} else {
 		if (write_fault) {
-			xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+			xfs_ilock(XFS_I(inode), mmaplock);
+			if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
+				xfs_iunlock(ip, mmaplock);
+				mmaplock = XFS_MMAPLOCK_EXCL;
+				xfs_ilock(XFS_I(inode), mmaplock);
+			}
 			ret = iomap_page_mkwrite(vmf,
 					&xfs_page_mkwrite_iomap_ops);
-			xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+			xfs_iunlock(XFS_I(inode), mmaplock);
 		} else {
 			ret = filemap_fault(vmf);
 		}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0c5bdb91152e..3046ddfa2358 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -347,6 +347,14 @@  static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
 /* Quotacheck is running but inode has not been added to quota counts. */
 #define XFS_IQUOTAUNCHECKED	(1 << 14)
 
+/*
+ * Remap in progress. Callers that wish to update file data while
+ * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake
+ * the lock in exclusive mode. Relocking the file will block until
+ * IREMAPPING is cleared.
+ */
+#define XFS_IREMAPPING		(1U << 15)
+
 /* All inode state flags related to inode reclaim. */
 #define XFS_ALL_IRECLAIM_FLAGS	(XFS_IRECLAIMABLE | \
 				 XFS_IRECLAIM | \
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index eb9102453aff..26cbf99061b0 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1540,6 +1540,10 @@  xfs_reflink_remap_prep(
 	if (ret)
 		goto out_unlock;
 
+	xfs_iflags_set(src, XFS_IREMAPPING);
+	if (inode_in != inode_out)
+		xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
+
 	return 0;
 out_unlock:
 	xfs_iunlock2_io_mmap(src, dest);
@@ -1718,3 +1722,18 @@  xfs_reflink_unshare(
 	trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
 	return error;
 }
+
+/* Unlock both inodes after the reflink completes. */
+void
+xfs_reflink_unlock(
+	struct xfs_inode	*ip1,
+	struct xfs_inode	*ip2)
+{
+	if (ip1 != ip2)
+		xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED);
+	xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+
+	if (ip1 != ip2)
+		inode_unlock_shared(VFS_I(ip1));
+	inode_unlock(VFS_I(ip2));
+}
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 65c5dfe17ecf..89f4d2a2f52e 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -53,4 +53,6 @@  extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in,
 extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
 		xfs_extlen_t cowextsize, unsigned int remap_flags);
 
+void xfs_reflink_unlock(struct xfs_inode *ip1, struct xfs_inode *ip2);
+
 #endif /* __XFS_REFLINK_H */