diff mbox series

[V2] xfs: make src file readable during reflink

Message ID 20220629060755.25537-1-wen.gang.wang@oracle.com (mailing list archive)
State New, archived
Headers show
Series [V2] xfs: make src file readable during reflink | expand

Commit Message

Wengang Wang June 29, 2022, 6:07 a.m. UTC
During a reflink operation, the IOLOCK and MMAPLOCK of the source file
are held in exclusive mode for the duration. This prevents reads on the
source file, which could be a very long time if the source file has
millions of extents.

As the source of the copy, besides some necessary modification (say dirty page
flushing), it plays a read-only role. Locking the source file exclusively
throughout the full reflink copy is unreasonable.

This patch downgrades the exclusive locks on the source file to shared mode
after page cache flushing and before cloning the extents. To avoid changes to
the source file after the lock downgrade, direct write paths take IOLOCK_EXCL
on seeing a reflink copy happening to the files.

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
---
V2 changes:
 Commit message
 Make direct write paths take IOLOCK_EXCL when reflink copy is happening
 Tiny changes
---
 fs/xfs/xfs_file.c  | 33 ++++++++++++++++++++++++++++++---
 fs/xfs/xfs_inode.c | 31 +++++++++++++++++++++++++++++++
 fs/xfs/xfs_inode.h | 11 +++++++++++
 3 files changed, 72 insertions(+), 3 deletions(-)

Comments

Wengang Wang July 5, 2022, 4:34 p.m. UTC | #1
Hi,

Any thought on this?

thanks,
wengang

> On Jun 28, 2022, at 11:07 PM, Wengang Wang <wen.gang.wang@oracle.com> wrote:
> 
> During a reflink operation, the IOLOCK and MMAPLOCK of the source file
> are held in exclusive mode for the duration. This prevents reads on the
> source file, which could be a very long time if the source file has
> millions of extents.
> 
> As the source of copy, besides some necessary modification (say dirty page
> flushing), it plays readonly role. Locking source file exclusively through
> out the full reflink copy is unreasonable.
> 
> This patch downgrades exclusive locks on source file to shared modes after
> page cache flushing and before cloning the extents. To avoid source file
> change after lock downgradation, direct write paths take IOLOCK_EXCL on
> seeing reflink copy happening to the files.
> 
> Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
> ---
> V2 changes:
> Commit message
> Make direct write paths take IOLOCK_EXCL when reflink copy is happening
> Tiny changes
> ---
> fs/xfs/xfs_file.c  | 33 ++++++++++++++++++++++++++++++---
> fs/xfs/xfs_inode.c | 31 +++++++++++++++++++++++++++++++
> fs/xfs/xfs_inode.h | 11 +++++++++++
> 3 files changed, 72 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 5a171c0b244b..6ca7118ee274 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -514,8 +514,10 @@ xfs_file_dio_write_aligned(
> 	struct iov_iter		*from)
> {
> 	unsigned int		iolock = XFS_IOLOCK_SHARED;
> +	int			remapping;
> 	ssize_t			ret;
> 
> +relock:
> 	ret = xfs_ilock_iocb(iocb, iolock);
> 	if (ret)
> 		return ret;
> @@ -523,14 +525,25 @@ xfs_file_dio_write_aligned(
> 	if (ret)
> 		goto out_unlock;
> 
> +	remapping = xfs_iflags_test(ip, XFS_IREMAPPING);
> +
> 	/*
> 	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
> 	 * the iolock back to shared if we had to take the exclusive lock in
> 	 * xfs_file_write_checks() for other reasons.
> +	 * But take IOLOCK_EXCL when reflink copy is going on
> 	 */
> 	if (iolock == XFS_IOLOCK_EXCL) {
> -		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
> -		iolock = XFS_IOLOCK_SHARED;
> +		if (!remapping) {
> +			xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
> +			iolock = XFS_IOLOCK_SHARED;
> +		}
> +	} else { /* iolock == XFS_ILOCK_SHARED */
> +		if (remapping) {
> +			xfs_iunlock(ip, iolock);
> +			iolock = XFS_IOLOCK_EXCL;
> +			goto relock;
> +		}
> 	}
> 	trace_xfs_file_direct_write(iocb, from);
> 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
> @@ -1125,6 +1138,19 @@ xfs_file_remap_range(
> 	if (ret || len == 0)
> 		return ret;
> 
> +	/*
> +	 * Set XFS_IREMAPPING flag to source file before we downgrade
> +	 * the locks, so that all direct writes know they have to take
> +	 * IOLOCK_EXCL.
> +	 */
> +	xfs_iflags_set(src, XFS_IREMAPPING);
> +
> +	/*
> +	 * From now on, we read only from src, so downgrade locks to allow
> +	 * read operations go.
> +	 */
> +	xfs_ilock_io_mmap_downgrade_src(src, dest);
> +
> 	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
> 
> 	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
> @@ -1152,7 +1178,8 @@ xfs_file_remap_range(
> 	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
> 		xfs_log_force_inode(dest);
> out_unlock:
> -	xfs_iunlock2_io_mmap(src, dest);
> +	xfs_iflags_clear(src, XFS_IREMAPPING);
> +	xfs_iunlock2_io_mmap_src_shared(src, dest);
> 	if (ret)
> 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
> 	return remapped > 0 ? remapped : ret;
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 52d6f2c7d58b..1cbd4a594f28 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -3786,6 +3786,16 @@ xfs_ilock2_io_mmap(
> 	return 0;
> }
> 
> +/* Downgrade the locks on src file if src and dest are not the same one. */
> +void
> +xfs_ilock_io_mmap_downgrade_src(
> +	struct xfs_inode	*src,
> +	struct xfs_inode	*dest)
> +{
> +	if (src != dest)
> +		xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
> +}
> +
> /* Unlock both inodes to allow IO and mmap activity. */
> void
> xfs_iunlock2_io_mmap(
> @@ -3798,3 +3808,24 @@ xfs_iunlock2_io_mmap(
> 	if (ip1 != ip2)
> 		inode_unlock(VFS_I(ip1));
> }
> +
> +/*
> + * Unlock the exclusive locks on dest file.
> + * Also unlock the shared locks on src if src and dest are not the same one
> + */
> +void
> +xfs_iunlock2_io_mmap_src_shared(
> +	struct xfs_inode	*src,
> +	struct xfs_inode	*dest)
> +{
> +	struct inode	*src_inode = VFS_I(src);
> +	struct inode	*dest_inode = VFS_I(dest);
> +
> +	inode_unlock(dest_inode);
> +	filemap_invalidate_unlock(dest_inode->i_mapping);
> +	if (src == dest)
> +		return;
> +
> +	inode_unlock_shared(src_inode);
> +	filemap_invalidate_unlock_shared(src_inode->i_mapping);
> +}
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 7be6f8e705ab..c07d4b42cf9d 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -262,6 +262,13 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
>  */
> #define XFS_INACTIVATING	(1 << 13)
> 
> +/*
> + * A flag indicating reflink copy / remapping is happening to the file as
> + * source. When set, all direct IOs should take IOLOCK_EXCL to avoid
> + * interphering the remapping.
> + */
> +#define XFS_IREMAPPING		(1 << 14)
> +
> /* All inode state flags related to inode reclaim. */
> #define XFS_ALL_IRECLAIM_FLAGS	(XFS_IRECLAIMABLE | \
> 				 XFS_IRECLAIM | \
> @@ -512,5 +519,9 @@ void xfs_end_io(struct work_struct *work);
> 
> int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
> void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
> +void xfs_ilock_io_mmap_downgrade_src(struct xfs_inode *src,
> +					struct xfs_inode *dest);
> +void xfs_iunlock2_io_mmap_src_shared(struct xfs_inode *src,
> +					struct xfs_inode *dest);
> 
> #endif	/* __XFS_INODE_H__ */
> -- 
> 2.21.0 (Apple Git-122.2)
>
Darrick J. Wong July 5, 2022, 6:37 p.m. UTC | #2
On Tue, Jun 28, 2022 at 11:07:55PM -0700, Wengang Wang wrote:
> During a reflink operation, the IOLOCK and MMAPLOCK of the source file
> are held in exclusive mode for the duration. This prevents reads on the
> source file, which could be a very long time if the source file has
> millions of extents.
> 
> As the source of copy, besides some necessary modification (say dirty page
> flushing), it plays readonly role. Locking source file exclusively through
> out the full reflink copy is unreasonable.
> 
> This patch downgrades exclusive locks on source file to shared modes after
> page cache flushing and before cloning the extents. To avoid source file
> change after lock downgradation, direct write paths take IOLOCK_EXCL on
> seeing reflink copy happening to the files.

This is going to complicate the synchronization logic between reflink
and everything else quite a bit -- right now we generally allow multiple
concurrent direct writers (IOLOCK) and write faults (MMAPLOCK) per file,
so space mapping operations (fallocate/reflink) can lock out those
writers in a simple manner.

> Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
> ---
> V2 changes:
>  Commit message
>  Make direct write paths take IOLOCK_EXCL when reflink copy is happening
>  Tiny changes
> ---
>  fs/xfs/xfs_file.c  | 33 ++++++++++++++++++++++++++++++---
>  fs/xfs/xfs_inode.c | 31 +++++++++++++++++++++++++++++++
>  fs/xfs/xfs_inode.h | 11 +++++++++++
>  3 files changed, 72 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 5a171c0b244b..6ca7118ee274 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -514,8 +514,10 @@ xfs_file_dio_write_aligned(
>  	struct iov_iter		*from)
>  {
>  	unsigned int		iolock = XFS_IOLOCK_SHARED;
> +	int			remapping;

bool?

>  	ssize_t			ret;
>  
> +relock:
>  	ret = xfs_ilock_iocb(iocb, iolock);
>  	if (ret)
>  		return ret;
> @@ -523,14 +525,25 @@ xfs_file_dio_write_aligned(
>  	if (ret)
>  		goto out_unlock;
>  
> +	remapping = xfs_iflags_test(ip, XFS_IREMAPPING);

remapping = xfs_has_reflink(mp) && xfs_iflags_test(ip, XFS_IREMAPPING);

so that you can skip the locked test on filesystems where remapping
isn't possible.

> +
>  	/*
>  	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
>  	 * the iolock back to shared if we had to take the exclusive lock in
>  	 * xfs_file_write_checks() for other reasons.
> +	 * But take IOLOCK_EXCL when reflink copy is going on
>  	 */
>  	if (iolock == XFS_IOLOCK_EXCL) {
> -		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
> -		iolock = XFS_IOLOCK_SHARED;
> +		if (!remapping) {
> +			xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
> +			iolock = XFS_IOLOCK_SHARED;
> +		}

Hm.  So the logic in the IOLOCK_EXCL case is that if a directio write
takes IOLOCK_EXCL, there can't possibly be a remap operation running.
Remap operations themselves always start by taking IOLOCK_EXCL before
setting IREMAPPING, so a remap operation cannot set IREMAPPING until
after this directio completes.

In the IOLOCK_SHARED case below, the directio upgrades to IOLOCK_EXCL if
a remap operation is detected.  We're protected against IREMAPPING
getting set while we hold IOLOCK_SHARED (because remap operations start
by taking IOLOCK_EXCL), though in theory we could race with the end of a
remapping operation, which at worst will result in an unnecessary
IOLOCK_EXCL acquisition, right?

There can only be one remapping operation in progress at a time because
they will take IOLOCK_EXCL initially and demote to _SHARED, so there
shouldn't be any races to setting and clearing IREMAPPING.

So I /think/ this works, but concurrency is hard to think about. :/

> +	} else { /* iolock == XFS_ILOCK_SHARED */

IOLOCK_SHARED, not ILOCK_SHARED?

> +		if (remapping) {
> +			xfs_iunlock(ip, iolock);
> +			iolock = XFS_IOLOCK_EXCL;
> +			goto relock;
> +		}
>  	}
>  	trace_xfs_file_direct_write(iocb, from);
>  	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
> @@ -1125,6 +1138,19 @@ xfs_file_remap_range(

Aren't changes necessary for xfs_file_dio_write_unaligned too?

>  	if (ret || len == 0)
>  		return ret;
>  
> +	/*
> +	 * Set XFS_IREMAPPING flag to source file before we downgrade
> +	 * the locks, so that all direct writes know they have to take
> +	 * IOLOCK_EXCL.
> +	 */
> +	xfs_iflags_set(src, XFS_IREMAPPING);
> +
> +	/*
> +	 * From now on, we read only from src, so downgrade locks to allow
> +	 * read operations go.
> +	 */
> +	xfs_ilock_io_mmap_downgrade_src(src, dest);
> +
>  	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
>  
>  	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
> @@ -1152,7 +1178,8 @@ xfs_file_remap_range(
>  	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
>  		xfs_log_force_inode(dest);
>  out_unlock:
> -	xfs_iunlock2_io_mmap(src, dest);
> +	xfs_iflags_clear(src, XFS_IREMAPPING);
> +	xfs_iunlock2_io_mmap_src_shared(src, dest);
>  	if (ret)
>  		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
>  	return remapped > 0 ? remapped : ret;
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 52d6f2c7d58b..1cbd4a594f28 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -3786,6 +3786,16 @@ xfs_ilock2_io_mmap(
>  	return 0;
>  }
>  
> +/* Downgrade the locks on src file if src and dest are not the same one. */
> +void
> +xfs_ilock_io_mmap_downgrade_src(
> +	struct xfs_inode	*src,
> +	struct xfs_inode	*dest)
> +{
> +	if (src != dest)
> +		xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);

Oh, you're downgrading MMAPLOCK_EXCL (aka invalidate_lock) too?

That's going to be tricky to figure out -- write page faults (and
apparently all DAX faults) take MMAPLOCK_SHARED (invalidate_lock_shared)
so I think you'd have to add similar "upgrade/downgrade lock" logic to
__xfs_filemap_fault?

I'm not 100% sure I'm correct about that statement, my head is starting
to spin and I'm not sure it's worth the complexity.

I /do/ wonder if range locking would be a better solution here, since we
can safely unlock file ranges that we've already remapped?

--D

> +}
> +
>  /* Unlock both inodes to allow IO and mmap activity. */
>  void
>  xfs_iunlock2_io_mmap(
> @@ -3798,3 +3808,24 @@ xfs_iunlock2_io_mmap(
>  	if (ip1 != ip2)
>  		inode_unlock(VFS_I(ip1));
>  }
> +
> +/*
> + * Unlock the exclusive locks on dest file.
> + * Also unlock the shared locks on src if src and dest are not the same one
> + */
> +void
> +xfs_iunlock2_io_mmap_src_shared(
> +	struct xfs_inode	*src,
> +	struct xfs_inode	*dest)
> +{
> +	struct inode	*src_inode = VFS_I(src);
> +	struct inode	*dest_inode = VFS_I(dest);
> +
> +	inode_unlock(dest_inode);
> +	filemap_invalidate_unlock(dest_inode->i_mapping);
> +	if (src == dest)
> +		return;
> +
> +	inode_unlock_shared(src_inode);
> +	filemap_invalidate_unlock_shared(src_inode->i_mapping);
> +}
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 7be6f8e705ab..c07d4b42cf9d 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -262,6 +262,13 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
>   */
>  #define XFS_INACTIVATING	(1 << 13)
>  
> +/*
> + * A flag indicating reflink copy / remapping is happening to the file as
> + * source. When set, all direct IOs should take IOLOCK_EXCL to avoid
> + * interphering the remapping.
> + */
> +#define XFS_IREMAPPING		(1 << 14)
> +
>  /* All inode state flags related to inode reclaim. */
>  #define XFS_ALL_IRECLAIM_FLAGS	(XFS_IRECLAIMABLE | \
>  				 XFS_IRECLAIM | \
> @@ -512,5 +519,9 @@ void xfs_end_io(struct work_struct *work);
>  
>  int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
>  void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
> +void xfs_ilock_io_mmap_downgrade_src(struct xfs_inode *src,
> +					struct xfs_inode *dest);
> +void xfs_iunlock2_io_mmap_src_shared(struct xfs_inode *src,
> +					struct xfs_inode *dest);
>  
>  #endif	/* __XFS_INODE_H__ */
> -- 
> 2.21.0 (Apple Git-122.2)
>
Dave Chinner July 6, 2022, 1:24 a.m. UTC | #3
On Tue, Jun 28, 2022 at 11:07:55PM -0700, Wengang Wang wrote:
> During a reflink operation, the IOLOCK and MMAPLOCK of the source file
> are held in exclusive mode for the duration. This prevents reads on the
> source file, which could be a very long time if the source file has
> millions of extents.
> 
> As the source of copy, besides some necessary modification (say dirty page
> flushing), it plays readonly role. Locking source file exclusively through
> out the full reflink copy is unreasonable.
> 
> This patch downgrades exclusive locks on source file to shared modes after
> page cache flushing and before cloning the extents. To avoid source file
> change after lock downgradation, direct write paths take IOLOCK_EXCL on
> seeing reflink copy happening to the files.
> 
> Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
> ---
> V2 changes:
>  Commit message
>  Make direct write paths take IOLOCK_EXCL when reflink copy is happening
>  Tiny changes
> ---
>  fs/xfs/xfs_file.c  | 33 ++++++++++++++++++++++++++++++---
>  fs/xfs/xfs_inode.c | 31 +++++++++++++++++++++++++++++++
>  fs/xfs/xfs_inode.h | 11 +++++++++++
>  3 files changed, 72 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 5a171c0b244b..6ca7118ee274 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -514,8 +514,10 @@ xfs_file_dio_write_aligned(
>  	struct iov_iter		*from)
>  {
>  	unsigned int		iolock = XFS_IOLOCK_SHARED;
> +	int			remapping;
>  	ssize_t			ret;
>  
> +relock:
>  	ret = xfs_ilock_iocb(iocb, iolock);
>  	if (ret)
>  		return ret;
> @@ -523,14 +525,25 @@ xfs_file_dio_write_aligned(
>  	if (ret)
>  		goto out_unlock;
>  
> +	remapping = xfs_iflags_test(ip, XFS_IREMAPPING);
> +
>  	/*
>  	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
>  	 * the iolock back to shared if we had to take the exclusive lock in
>  	 * xfs_file_write_checks() for other reasons.
> +	 * But take IOLOCK_EXCL when reflink copy is going on
>  	 */
>  	if (iolock == XFS_IOLOCK_EXCL) {
> -		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
> -		iolock = XFS_IOLOCK_SHARED;
> +		if (!remapping) {
> +			xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
> +			iolock = XFS_IOLOCK_SHARED;
> +		}
> +	} else { /* iolock == XFS_ILOCK_SHARED */
> +		if (remapping) {
> +			xfs_iunlock(ip, iolock);
> +			iolock = XFS_IOLOCK_EXCL;
> +			goto relock;
> +		}
>  	}

I'm not sure we can do relocking here. We've already run
xfs_file_write_checks() once which means we've already called
file_modified() and made changes to the inode.

Indeed, if we know that remapping is going on, why are we even
starting with XFS_IOLOCK_SHARED? i.e. shouldn't this just be:

	unsigned int		iolock = XFS_IOLOCK_SHARED;
	ssize_t			ret;

	if (!xfs_iflags_test(ip, XFS_IREMAPPING))
		iolock = XFS_IOLOCK_EXCL;

retry:
	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;
	if (xfs_iflags_test(ip, XFS_IREMAPPING) &&
	    iolock == XFS_IOLOCK_SHARED) {
		/* Raced with a remap operation starting! */
		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
		iolock = XFS_IOLOCK_EXCL;
		goto retry;
	}
	....

And no other changes need to be made? i.e. we can downgrade the
XFS_IOLOCK_EXCL safely at any time in this path because the barrier
to reflink starting and setting the XFS_IREMAPPING flag is that it
has to be holding XFS_IOLOCK_EXCL and holding the IOLOCK in any mode
will hold off any reflink starting.

However, the key thing to note is that holding the IOLOCK in either
EXCL or SHARED does not not actually guarantee that there are no
DIO writes in progress. We only have to hold the lock over
submission to ensure the inode_dio_begin() call is serialised
correctly.....

i.e. when we do AIO DIO writes, we hold the IOLOCK_SHARED over
submission while the inode->i_dio_count is incremented, but then
drop the IOLOCK before completion occurs.

Hence the only way to guarantee that there is no DIO writes in
progress is to take IOLOCK_EXCL to stop new DIO writes from
starting, and then call inode_dio_wait() to wait for i_dio_count to
fall to zero. Once inode_dio_wait() returns, we're guaranteed that
there are no DIO writes in progress.

IOWs, the demotion from EXCL to SHARED in
xfs_file_dio_write_aligned() code is completely irrelevant from the
perspective of serialising against DIO writes in progress, so we
don't need to touch that code.

However, if we are going to demote the IOLOCK in the reflink code,
we now need a separate barrier to prevent dio writes from starting
while the reflink is in progress. As you've implemented, the
IREMAPPING flag provides this barrier, but your code didn't handle
the TOCTOU races in gaining the IOLOCK in the correct mode in the
DIO path....

>  	trace_xfs_file_direct_write(iocb, from);
>  	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
> @@ -1125,6 +1138,19 @@ xfs_file_remap_range(
>  	if (ret || len == 0)
>  		return ret;
>  
> +	/*
> +	 * Set XFS_IREMAPPING flag to source file before we downgrade
> +	 * the locks, so that all direct writes know they have to take
> +	 * IOLOCK_EXCL.
> +	 */
> +	xfs_iflags_set(src, XFS_IREMAPPING);
> +
> +	/*
> +	 * From now on, we read only from src, so downgrade locks to allow
> +	 * read operations go.
> +	 */
> +	xfs_ilock_io_mmap_downgrade_src(src, dest);

Oh, you've been lucky here. :) xfs_reflink_remap_prep() ->
generic_remap_file_range_prep() calls inode_dio_wait() on both
inodes while they are locked exclusively.

However, I don't think you can downgrade the mmap lock here - that
will allow page faults to dirty the pages in the source file. I'm
also not sure that we can even take the mmap lock exclusively in
the page fault path like we can the IOLOCK in the IO path....

Hence I think it is simplest just to consider it unsafe to downgrade
the mmap lock here and so only the IO lock can be downgraded. For
read IO, that's the only one that needs to be downgraded, anyway.

> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 7be6f8e705ab..c07d4b42cf9d 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -262,6 +262,13 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
>   */
>  #define XFS_INACTIVATING	(1 << 13)
>  
> +/*
> + * A flag indicating reflink copy / remapping is happening to the file as
> + * source. When set, all direct IOs should take IOLOCK_EXCL to avoid
> + * interphering the remapping.
> + */
> +#define XFS_IREMAPPING		(1 << 14)
> +
>  /* All inode state flags related to inode reclaim. */
>  #define XFS_ALL_IRECLAIM_FLAGS	(XFS_IRECLAIMABLE | \
>  				 XFS_IRECLAIM | \
> @@ -512,5 +519,9 @@ void xfs_end_io(struct work_struct *work);
>  
>  int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
>  void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
> +void xfs_ilock_io_mmap_downgrade_src(struct xfs_inode *src,
> +					struct xfs_inode *dest);
> +void xfs_iunlock2_io_mmap_src_shared(struct xfs_inode *src,
> +					struct xfs_inode *dest);

I suspect we are at the point where we really need to move all the
inode locking out of xfs_inode.[ch] and into separate
xfs_inode_locking.[ch] files. Not necessary for this patchset, but
if we keep adding more locking helpers....

Cheers,

Dave.
Dave Chinner July 6, 2022, 1:35 a.m. UTC | #4
On Tue, Jul 05, 2022 at 11:37:54AM -0700, Darrick J. Wong wrote:
> On Tue, Jun 28, 2022 at 11:07:55PM -0700, Wengang Wang wrote:
> > During a reflink operation, the IOLOCK and MMAPLOCK of the source file
> > are held in exclusive mode for the duration. This prevents reads on the
> > source file, which could be a very long time if the source file has
> > millions of extents.
> > 
> > As the source of copy, besides some necessary modification (say dirty page
> > flushing), it plays readonly role. Locking source file exclusively through
> > out the full reflink copy is unreasonable.
> > 
> > This patch downgrades exclusive locks on source file to shared modes after
> > page cache flushing and before cloning the extents. To avoid source file
> > change after lock downgradation, direct write paths take IOLOCK_EXCL on
> > seeing reflink copy happening to the files.
.....

> I /do/ wonder if range locking would be a better solution here, since we
> can safely unlock file ranges that we've already remapped?

Depends. The prototype I did allowed concurrent remaps to run on
different ranges of the file. The extent manipulations were still
internally serialised by the ILOCK so the concurrent modifications
were still serialised. Hence things like block mapping lookups for
read IO still serialised. (And hence my interest in lockless btrees
for the extent list so read IO wouldn't need to take the ILOCK at
all.)

However, if you want to remap the entire file, we've still got to
start with locking the entire file range and draining IO and writing
back all dirty data. So cloning a file is still a complete
serialisation event with range locking, but we can optimise away
some of the tail latencies by unlocking ranges remapped range by
remapped range...

Cheers,

Dave.
Wengang Wang Aug. 4, 2022, 2:50 a.m. UTC | #5
Thank you Dave and Darrick, I will think more on this.

Thanks,
Wengang

> On Jul 5, 2022, at 6:35 PM, Dave Chinner <david@fromorbit.com> wrote:
> 
> On Tue, Jul 05, 2022 at 11:37:54AM -0700, Darrick J. Wong wrote:
>> On Tue, Jun 28, 2022 at 11:07:55PM -0700, Wengang Wang wrote:
>>> During a reflink operation, the IOLOCK and MMAPLOCK of the source file
>>> are held in exclusive mode for the duration. This prevents reads on the
>>> source file, which could be a very long time if the source file has
>>> millions of extents.
>>> 
>>> As the source of copy, besides some necessary modification (say dirty page
>>> flushing), it plays readonly role. Locking source file exclusively through
>>> out the full reflink copy is unreasonable.
>>> 
>>> This patch downgrades exclusive locks on source file to shared modes after
>>> page cache flushing and before cloning the extents. To avoid source file
>>> change after lock downgradation, direct write paths take IOLOCK_EXCL on
>>> seeing reflink copy happening to the files.
> .....
> 
>> I /do/ wonder if range locking would be a better solution here, since we
>> can safely unlock file ranges that we've already remapped?
> 
> Depends. The prototype I did allowed concurrent remaps to run on
> different ranges of the file. The extent manipulations were still
> internally serialised by the ILOCK so the concurrent modifications
> were still serialised. Hence things like block mapping lookups for
> read IO still serialised. (And hence my interest in lockless btrees
> for the extent list so read IO wouldn't need to take the ILOCK at
> all.)
> 
> However, if you want to remap the entire file, we've still got to
> start with locking the entire file range and draining IO and writing
> back all dirty data. So cloning a file is still a complete
> serialisation event with range locking, but we can optimise away
> some of the tail latencies by unlocking ranges remapped range by
> remapped range...
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
diff mbox series

Patch

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5a171c0b244b..6ca7118ee274 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -514,8 +514,10 @@  xfs_file_dio_write_aligned(
 	struct iov_iter		*from)
 {
 	unsigned int		iolock = XFS_IOLOCK_SHARED;
+	int			remapping;
 	ssize_t			ret;
 
+relock:
 	ret = xfs_ilock_iocb(iocb, iolock);
 	if (ret)
 		return ret;
@@ -523,14 +525,25 @@  xfs_file_dio_write_aligned(
 	if (ret)
 		goto out_unlock;
 
+	remapping = xfs_iflags_test(ip, XFS_IREMAPPING);
+
 	/*
 	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
 	 * the iolock back to shared if we had to take the exclusive lock in
 	 * xfs_file_write_checks() for other reasons.
+	 * But take IOLOCK_EXCL when reflink copy is going on
 	 */
 	if (iolock == XFS_IOLOCK_EXCL) {
-		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
-		iolock = XFS_IOLOCK_SHARED;
+		if (!remapping) {
+			xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
+			iolock = XFS_IOLOCK_SHARED;
+		}
+	} else { /* iolock == XFS_IOLOCK_SHARED */
+		if (remapping) {
+			xfs_iunlock(ip, iolock);
+			iolock = XFS_IOLOCK_EXCL;
+			goto relock;
+		}
 	}
 	trace_xfs_file_direct_write(iocb, from);
 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
@@ -1125,6 +1138,19 @@  xfs_file_remap_range(
 	if (ret || len == 0)
 		return ret;
 
+	/*
+	 * Set XFS_IREMAPPING flag to source file before we downgrade
+	 * the locks, so that all direct writes know they have to take
+	 * IOLOCK_EXCL.
+	 */
+	xfs_iflags_set(src, XFS_IREMAPPING);
+
+	/*
+	 * From now on, we read only from src, so downgrade locks to allow
+	 * read operations to proceed.
+	 */
+	xfs_ilock_io_mmap_downgrade_src(src, dest);
+
 	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
 
 	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
@@ -1152,7 +1178,8 @@  xfs_file_remap_range(
 	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
 		xfs_log_force_inode(dest);
 out_unlock:
-	xfs_iunlock2_io_mmap(src, dest);
+	xfs_iflags_clear(src, XFS_IREMAPPING);
+	xfs_iunlock2_io_mmap_src_shared(src, dest);
 	if (ret)
 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
 	return remapped > 0 ? remapped : ret;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 52d6f2c7d58b..1cbd4a594f28 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3786,6 +3786,16 @@  xfs_ilock2_io_mmap(
 	return 0;
 }
 
+/* Downgrade the locks on src file if src and dest are not the same one. */
+void
+xfs_ilock_io_mmap_downgrade_src(
+	struct xfs_inode	*src,
+	struct xfs_inode	*dest)
+{
+	if (src != dest)
+		xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
+}
+
 /* Unlock both inodes to allow IO and mmap activity. */
 void
 xfs_iunlock2_io_mmap(
@@ -3798,3 +3808,24 @@  xfs_iunlock2_io_mmap(
 	if (ip1 != ip2)
 		inode_unlock(VFS_I(ip1));
 }
+
+/*
+ * Unlock the exclusive locks on dest file.
+ * Also unlock the shared locks on src if src and dest are not the same one
+ */
+void
+xfs_iunlock2_io_mmap_src_shared(
+	struct xfs_inode	*src,
+	struct xfs_inode	*dest)
+{
+	struct inode	*src_inode = VFS_I(src);
+	struct inode	*dest_inode = VFS_I(dest);
+
+	inode_unlock(dest_inode);
+	filemap_invalidate_unlock(dest_inode->i_mapping);
+	if (src == dest)
+		return;
+
+	inode_unlock_shared(src_inode);
+	filemap_invalidate_unlock_shared(src_inode->i_mapping);
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 7be6f8e705ab..c07d4b42cf9d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -262,6 +262,13 @@  static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
  */
 #define XFS_INACTIVATING	(1 << 13)
 
+/*
+ * A flag indicating reflink copy / remapping is happening to the file as
+ * source. When set, all direct IOs should take IOLOCK_EXCL to avoid
+ * interfering with the remapping.
+ */
+#define XFS_IREMAPPING		(1 << 14)
+
 /* All inode state flags related to inode reclaim. */
 #define XFS_ALL_IRECLAIM_FLAGS	(XFS_IRECLAIMABLE | \
 				 XFS_IRECLAIM | \
@@ -512,5 +519,9 @@  void xfs_end_io(struct work_struct *work);
 
 int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
 void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+void xfs_ilock_io_mmap_downgrade_src(struct xfs_inode *src,
+					struct xfs_inode *dest);
+void xfs_iunlock2_io_mmap_src_shared(struct xfs_inode *src,
+					struct xfs_inode *dest);
 
 #endif	/* __XFS_INODE_H__ */