Message ID | 20220629060755.25537-1-wen.gang.wang@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [V2] xfs: make src file readable during reflink | expand |
Hi, Any thought on this? thanks, wengang > On Jun 28, 2022, at 11:07 PM, Wengang Wang <wen.gang.wang@oracle.com> wrote: > > During a reflink operation, the IOLOCK and MMAPLOCK of the source file > are held in exclusive mode for the duration. This prevents reads on the > source file, which could be a very long time if the source file has > millions of extents. > > As the source of copy, besides some necessary modification (say dirty page > flushing), it plays readonly role. Locking source file exclusively through > out the full reflink copy is unreasonable. > > This patch downgrades exclusive locks on source file to shared modes after > page cache flushing and before cloning the extents. To avoid source file > change after lock downgradation, direct write paths take IOLOCK_EXCL on > seeing reflink copy happening to the files. > > Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com> > --- > V2 changes: > Commit message > Make direct write paths take IOLOCK_EXCL when reflink copy is happening > Tiny changes > --- > fs/xfs/xfs_file.c | 33 ++++++++++++++++++++++++++++++--- > fs/xfs/xfs_inode.c | 31 +++++++++++++++++++++++++++++++ > fs/xfs/xfs_inode.h | 11 +++++++++++ > 3 files changed, 72 insertions(+), 3 deletions(-) > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index 5a171c0b244b..6ca7118ee274 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -514,8 +514,10 @@ xfs_file_dio_write_aligned( > struct iov_iter *from) > { > unsigned int iolock = XFS_IOLOCK_SHARED; > + int remapping; > ssize_t ret; > > +relock: > ret = xfs_ilock_iocb(iocb, iolock); > if (ret) > return ret; > @@ -523,14 +525,25 @@ xfs_file_dio_write_aligned( > if (ret) > goto out_unlock; > > + remapping = xfs_iflags_test(ip, XFS_IREMAPPING); > + > /* > * We don't need to hold the IOLOCK exclusively across the IO, so demote > * the iolock back to shared if we had to take the exclusive lock in > * xfs_file_write_checks() for other reasons. > + * But take IOLOCK_EXCL when reflink copy is going on > */ > if (iolock == XFS_IOLOCK_EXCL) { > - xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); > - iolock = XFS_IOLOCK_SHARED; > + if (!remapping) { > + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); > + iolock = XFS_IOLOCK_SHARED; > + } > + } else { /* iolock == XFS_ILOCK_SHARED */ > + if (remapping) { > + xfs_iunlock(ip, iolock); > + iolock = XFS_IOLOCK_EXCL; > + goto relock; > + } > } > trace_xfs_file_direct_write(iocb, from); > ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, > @@ -1125,6 +1138,19 @@ xfs_file_remap_range( > if (ret || len == 0) > return ret; > > + /* > + * Set XFS_IREMAPPING flag to source file before we downgrade > + * the locks, so that all direct writes know they have to take > + * IOLOCK_EXCL. > + */ > + xfs_iflags_set(src, XFS_IREMAPPING); > + > + /* > + * From now on, we read only from src, so downgrade locks to allow > + * read operations go. > + */ > + xfs_ilock_io_mmap_downgrade_src(src, dest); > + > trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); > > ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len, > @@ -1152,7 +1178,8 @@ xfs_file_remap_range( > if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out)) > xfs_log_force_inode(dest); > out_unlock: > - xfs_iunlock2_io_mmap(src, dest); > + xfs_iflags_clear(src, XFS_IREMAPPING); > + xfs_iunlock2_io_mmap_src_shared(src, dest); > if (ret) > trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); > return remapped > 0 ? remapped : ret; > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > index 52d6f2c7d58b..1cbd4a594f28 100644 > --- a/fs/xfs/xfs_inode.c > +++ b/fs/xfs/xfs_inode.c > @@ -3786,6 +3786,16 @@ xfs_ilock2_io_mmap( > return 0; > } > > +/* Downgrade the locks on src file if src and dest are not the same one. */ > +void > +xfs_ilock_io_mmap_downgrade_src( > + struct xfs_inode *src, > + struct xfs_inode *dest) > +{ > + if (src != dest) > + xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); > +} > + > /* Unlock both inodes to allow IO and mmap activity. */ > void > xfs_iunlock2_io_mmap( > @@ -3798,3 +3808,24 @@ xfs_iunlock2_io_mmap( > if (ip1 != ip2) > inode_unlock(VFS_I(ip1)); > } > + > +/* > + * Unlock the exclusive locks on dest file. > + * Also unlock the shared locks on src if src and dest are not the same one > + */ > +void > +xfs_iunlock2_io_mmap_src_shared( > + struct xfs_inode *src, > + struct xfs_inode *dest) > +{ > + struct inode *src_inode = VFS_I(src); > + struct inode *dest_inode = VFS_I(dest); > + > + inode_unlock(dest_inode); > + filemap_invalidate_unlock(dest_inode->i_mapping); > + if (src == dest) > + return; > + > + inode_unlock_shared(src_inode); > + filemap_invalidate_unlock_shared(src_inode->i_mapping); > +} > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > index 7be6f8e705ab..c07d4b42cf9d 100644 > --- a/fs/xfs/xfs_inode.h > +++ b/fs/xfs/xfs_inode.h > @@ -262,6 +262,13 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) > */ > #define XFS_INACTIVATING (1 << 13) > > +/* > + * A flag indicating reflink copy / remapping is happening to the file as > + * source. When set, all direct IOs should take IOLOCK_EXCL to avoid > + * interphering the remapping. > + */ > +#define XFS_IREMAPPING (1 << 14) > + > /* All inode state flags related to inode reclaim. */ > #define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \ > XFS_IRECLAIM | \ > @@ -512,5 +519,9 @@ void xfs_end_io(struct work_struct *work); > > int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); > void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); > +void xfs_ilock_io_mmap_downgrade_src(struct xfs_inode *src, > + struct xfs_inode *dest); > +void xfs_iunlock2_io_mmap_src_shared(struct xfs_inode *src, > + struct xfs_inode *dest); > > #endif /* __XFS_INODE_H__ */ > -- > 2.21.0 (Apple Git-122.2) >
On Tue, Jun 28, 2022 at 11:07:55PM -0700, Wengang Wang wrote: > During a reflink operation, the IOLOCK and MMAPLOCK of the source file > are held in exclusive mode for the duration. This prevents reads on the > source file, which could be a very long time if the source file has > millions of extents. > > As the source of copy, besides some necessary modification (say dirty page > flushing), it plays readonly role. Locking source file exclusively through > out the full reflink copy is unreasonable. > > This patch downgrades exclusive locks on source file to shared modes after > page cache flushing and before cloning the extents. To avoid source file > change after lock downgradation, direct write paths take IOLOCK_EXCL on > seeing reflink copy happening to the files. This is going to complicate the synchronization logic between reflink and everything else quite a bit -- right now we generally allow multiple concurrent direct writers (IOLOCK) and write faults (MMAPLOCK) per file, so space mapping operations (fallocate/reflink) can lock out those writers in a simple manner. > Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com> > --- > V2 changes: > Commit message > Make direct write paths take IOLOCK_EXCL when reflink copy is happening > Tiny changes > --- > fs/xfs/xfs_file.c | 33 ++++++++++++++++++++++++++++++--- > fs/xfs/xfs_inode.c | 31 +++++++++++++++++++++++++++++++ > fs/xfs/xfs_inode.h | 11 +++++++++++ > 3 files changed, 72 insertions(+), 3 deletions(-) > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index 5a171c0b244b..6ca7118ee274 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -514,8 +514,10 @@ xfs_file_dio_write_aligned( > struct iov_iter *from) > { > unsigned int iolock = XFS_IOLOCK_SHARED; > + int remapping; bool? > ssize_t ret; > > +relock: > ret = xfs_ilock_iocb(iocb, iolock); > if (ret) > return ret; > @@ -523,14 +525,25 @@ xfs_file_dio_write_aligned( > if (ret) > goto out_unlock; > > + remapping = xfs_iflags_test(ip, XFS_IREMAPPING); remapping = xfs_has_reflink(mp) && xfs_iflags_test(ip, XFS_IREMAPPING); so that you can skip the locked test on filesystems where remapping isn't possible. > + > /* > * We don't need to hold the IOLOCK exclusively across the IO, so demote > * the iolock back to shared if we had to take the exclusive lock in > * xfs_file_write_checks() for other reasons. > + * But take IOLOCK_EXCL when reflink copy is going on > */ > if (iolock == XFS_IOLOCK_EXCL) { > - xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); > - iolock = XFS_IOLOCK_SHARED; > + if (!remapping) { > + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); > + iolock = XFS_IOLOCK_SHARED; > + } Hm. So the logic in the IOLOCK_EXCL case is that if a directio write takes IOLOCK_EXCL, there can't possibly be a remap operation running. Remap operations themselves always start by taking IOLOCK_EXCL before setting IREMAPPING, so a remap operation cannot set IREMAPPING until after this directio completes. In the IOLOCK_SHARED case below, the directio upgrades to IOLOCK_EXCL if a remap operation is detected. We're protected against IREMAPPING getting set while we hold IOLOCK_SHARED (because remap operations start by taking IOLOCK_EXCL), though in theory we could race with the end of a remapping operation, which at worst will result in an unnecessary IOLOCK_EXCL acquisition, right? There can only be one remapping operation in progress at a time because they will take IOLOCK_EXCL initially and demote to _SHARED, so there shouldn't be any races to setting and clearing IREMAPPING. So I /think/ this works, but concurrency is hard to think about. :/ > + } else { /* iolock == XFS_ILOCK_SHARED */ IOLOCK_SHARED, not ILOCK_SHARED? > + if (remapping) { > + xfs_iunlock(ip, iolock); > + iolock = XFS_IOLOCK_EXCL; > + goto relock; > + } > } > trace_xfs_file_direct_write(iocb, from); > ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, > @@ -1125,6 +1138,19 @@ xfs_file_remap_range( Aren't changes necessary for xfs_file_dio_write_unaligned too? > if (ret || len == 0) > return ret; > > + /* > + * Set XFS_IREMAPPING flag to source file before we downgrade > + * the locks, so that all direct writes know they have to take > + * IOLOCK_EXCL. > + */ > + xfs_iflags_set(src, XFS_IREMAPPING); > + > + /* > + * From now on, we read only from src, so downgrade locks to allow > + * read operations go. > + */ > + xfs_ilock_io_mmap_downgrade_src(src, dest); > + > trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); > > ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len, > @@ -1152,7 +1178,8 @@ xfs_file_remap_range( > if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out)) > xfs_log_force_inode(dest); > out_unlock: > - xfs_iunlock2_io_mmap(src, dest); > + xfs_iflags_clear(src, XFS_IREMAPPING); > + xfs_iunlock2_io_mmap_src_shared(src, dest); > if (ret) > trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); > return remapped > 0 ? remapped : ret; > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > index 52d6f2c7d58b..1cbd4a594f28 100644 > --- a/fs/xfs/xfs_inode.c > +++ b/fs/xfs/xfs_inode.c > @@ -3786,6 +3786,16 @@ xfs_ilock2_io_mmap( > return 0; > } > > +/* Downgrade the locks on src file if src and dest are not the same one. */ > +void > +xfs_ilock_io_mmap_downgrade_src( > + struct xfs_inode *src, > + struct xfs_inode *dest) > +{ > + if (src != dest) > + xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); Oh, you're downgrading MMAPLOCK_EXCL (aka invalidate_lock) too? That's going to be tricky to figure out -- write page faults (and apparently all DAX faults) take MMAPLOCK_SHARED (invalidate_lock_shared) so I think you'd have to add similar "upgrade/downgrade lock" logic to __xfs_filemap_fault? I'm not 100% sure I'm correct about that statement, my head is starting to spin and I'm not sure it's worth the complexity. I /do/ wonder if range locking would be a better solution here, since we can safely unlock file ranges that we've already remapped? --D > +} > + > /* Unlock both inodes to allow IO and mmap activity. */ > void > xfs_iunlock2_io_mmap( > @@ -3798,3 +3808,24 @@ xfs_iunlock2_io_mmap( > if (ip1 != ip2) > inode_unlock(VFS_I(ip1)); > } > + > +/* > + * Unlock the exclusive locks on dest file. > + * Also unlock the shared locks on src if src and dest are not the same one > + */ > +void > +xfs_iunlock2_io_mmap_src_shared( > + struct xfs_inode *src, > + struct xfs_inode *dest) > +{ > + struct inode *src_inode = VFS_I(src); > + struct inode *dest_inode = VFS_I(dest); > + > + inode_unlock(dest_inode); > + filemap_invalidate_unlock(dest_inode->i_mapping); > + if (src == dest) > + return; > + > + inode_unlock_shared(src_inode); > + filemap_invalidate_unlock_shared(src_inode->i_mapping); > +} > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > index 7be6f8e705ab..c07d4b42cf9d 100644 > --- a/fs/xfs/xfs_inode.h > +++ b/fs/xfs/xfs_inode.h > @@ -262,6 +262,13 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) > */ > #define XFS_INACTIVATING (1 << 13) > > +/* > + * A flag indicating reflink copy / remapping is happening to the file as > + * source. When set, all direct IOs should take IOLOCK_EXCL to avoid > + * interphering the remapping. > + */ > +#define XFS_IREMAPPING (1 << 14) > + > /* All inode state flags related to inode reclaim. */ > #define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \ > XFS_IRECLAIM | \ > @@ -512,5 +519,9 @@ void xfs_end_io(struct work_struct *work); > > int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); > void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); > +void xfs_ilock_io_mmap_downgrade_src(struct xfs_inode *src, > + struct xfs_inode *dest); > +void xfs_iunlock2_io_mmap_src_shared(struct xfs_inode *src, > + struct xfs_inode *dest); > > #endif /* __XFS_INODE_H__ */ > -- > 2.21.0 (Apple Git-122.2) >
On Tue, Jun 28, 2022 at 11:07:55PM -0700, Wengang Wang wrote: > During a reflink operation, the IOLOCK and MMAPLOCK of the source file > are held in exclusive mode for the duration. This prevents reads on the > source file, which could be a very long time if the source file has > millions of extents. > > As the source of copy, besides some necessary modification (say dirty page > flushing), it plays readonly role. Locking source file exclusively through > out the full reflink copy is unreasonable. > > This patch downgrades exclusive locks on source file to shared modes after > page cache flushing and before cloning the extents. To avoid source file > change after lock downgradation, direct write paths take IOLOCK_EXCL on > seeing reflink copy happening to the files. > > Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com> > --- > V2 changes: > Commit message > Make direct write paths take IOLOCK_EXCL when reflink copy is happening > Tiny changes > --- > fs/xfs/xfs_file.c | 33 ++++++++++++++++++++++++++++++--- > fs/xfs/xfs_inode.c | 31 +++++++++++++++++++++++++++++++ > fs/xfs/xfs_inode.h | 11 +++++++++++ > 3 files changed, 72 insertions(+), 3 deletions(-) > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index 5a171c0b244b..6ca7118ee274 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -514,8 +514,10 @@ xfs_file_dio_write_aligned( > struct iov_iter *from) > { > unsigned int iolock = XFS_IOLOCK_SHARED; > + int remapping; > ssize_t ret; > > +relock: > ret = xfs_ilock_iocb(iocb, iolock); > if (ret) > return ret; > @@ -523,14 +525,25 @@ xfs_file_dio_write_aligned( > if (ret) > goto out_unlock; > > + remapping = xfs_iflags_test(ip, XFS_IREMAPPING); > + > /* > * We don't need to hold the IOLOCK exclusively across the IO, so demote > * the iolock back to shared if we had to take the exclusive lock in > * xfs_file_write_checks() for other reasons. > + * But take IOLOCK_EXCL when reflink copy is going on > */ > if (iolock == XFS_IOLOCK_EXCL) { > - xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); > - iolock = XFS_IOLOCK_SHARED; > + if (!remapping) { > + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); > + iolock = XFS_IOLOCK_SHARED; > + } > + } else { /* iolock == XFS_ILOCK_SHARED */ > + if (remapping) { > + xfs_iunlock(ip, iolock); > + iolock = XFS_IOLOCK_EXCL; > + goto relock; > + } > } I'm not sure we can do relocking here. We've already run xfs_file_write_checks() once which means we've already called file_modified() and made changes to the inode. Indeed, if we know that remapping is going on, why are we even starting with XFS_IOLOCK_SHARED? i.e. shouldn't this just be: unsigned int iolock = XFS_IOLOCK_SHARED; ssize_t ret; if (!xfs_iflags_test(ip, XFS_IREMAPPING)) iolock = XFS_IOLOCK_EXCL; retry: ret = xfs_ilock_iocb(iocb, iolock); if (ret) return ret; if (xfs_iflags_test(ip, XFS_IREMAPPING) && iolock == XFS_IOLOCK_SHARED) { /* Raced with a remap operation starting! */ xfs_iunlock(ip, XFS_IOLOCK_SHARED); iolock = XFS_IOLOCK_EXCL; goto restart; } .... And no other changes need to be made? i.e. we can downgrade the XFS_IOLOCK_EXCL safely at any time in this path because the barrier to reflink starting and setting the XFS_IREMAPPING flag is that it has to be holding XFS_IOLOCK_EXCL and holding the IOLOCK in any mode will hold off any reflink starting. However, the key thing to note is that holding the IOLOCK in either EXCL or SHARED does not not actually guarantee that there are no DIO writes in progress. We only have to hold the lock over submission to ensure the inode_dio_begin() call is serialised correctly..... i.e. when we do AIO DIO writes, we hold the IOLOCK_SHARED over submission while the inode->i_dio_count is incremented, but then drop the IOLOCK before completion occurs. Hence the only way to guarantee that there is no DIO writes in progress is to take IOLOCK_EXCL to stop new DIO writes from starting, and then call inode_dio_wait() to wait for i_dio_count to fall to zero. Once inode_dio_wait() returns, we're guaranteed that there are no DIO writes in progress. IOWs, the demotion from EXCL to SHARED in xfs_file_dio_write_aligned() code is completely irrelevant from the perspective of serialising against DIO writes in progress, so we don't need to touch that code. However, if we are going to demote the IOLOCK in the reflink code, we now need a separate barrier to prevent dio writes from starting while the reflink is in progress. As you've implemented, the IREMAPPING flag provides this barrier, but your code didn't handle the TOCTOU races in gaining the IOLOCK in the correct mode in the DIO path.... > trace_xfs_file_direct_write(iocb, from); > ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, > @@ -1125,6 +1138,19 @@ xfs_file_remap_range( > if (ret || len == 0) > return ret; > > + /* > + * Set XFS_IREMAPPING flag to source file before we downgrade > + * the locks, so that all direct writes know they have to take > + * IOLOCK_EXCL. > + */ > + xfs_iflags_set(src, XFS_IREMAPPING); > + > + /* > + * From now on, we read only from src, so downgrade locks to allow > + * read operations go. > + */ > + xfs_ilock_io_mmap_downgrade_src(src, dest); Oh, you've been lucky here. :) xfs_reflink_remap_prep() -> generic_remap_file_range_prep() calls inode_dio_wait() on both inodes while they are locked exclusively. However, I don't think you can downgrade the mmap lock here - that will allow page faults to dirty the pages in the source file. I'm also not sure that we can even take the mmap lock exclusively in the page fault path like we can the IOLOCK in the IO path.... Hence I think it is simplest just to consider it unsafe to downgrade the mmap lock here and so only the IO lock can be downgraded. For read IO, that's the only one that needs to be downgraded, anyway. > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > index 7be6f8e705ab..c07d4b42cf9d 100644 > --- a/fs/xfs/xfs_inode.h > +++ b/fs/xfs/xfs_inode.h > @@ -262,6 +262,13 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) > */ > #define XFS_INACTIVATING (1 << 13) > > +/* > + * A flag indicating reflink copy / remapping is happening to the file as > + * source. When set, all direct IOs should take IOLOCK_EXCL to avoid > + * interphering the remapping. > + */ > +#define XFS_IREMAPPING (1 << 14) > + > /* All inode state flags related to inode reclaim. */ > #define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \ > XFS_IRECLAIM | \ > @@ -512,5 +519,9 @@ void xfs_end_io(struct work_struct *work); > > int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); > void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); > +void xfs_ilock_io_mmap_downgrade_src(struct xfs_inode *src, > + struct xfs_inode *dest); > +void xfs_iunlock2_io_mmap_src_shared(struct xfs_inode *src, > + struct xfs_inode *dest); I suspect we are at the point where we really need to move all the inode locking out of xfs_inode.[ch] and into separate xfs_inode_locking.[ch] files. Not necessary for this patchset, but if we keep adding more locking helpers.... Cheers, Dave.
On Tue, Jul 05, 2022 at 11:37:54AM -0700, Darrick J. Wong wrote: > On Tue, Jun 28, 2022 at 11:07:55PM -0700, Wengang Wang wrote: > > During a reflink operation, the IOLOCK and MMAPLOCK of the source file > > are held in exclusive mode for the duration. This prevents reads on the > > source file, which could be a very long time if the source file has > > millions of extents. > > > > As the source of copy, besides some necessary modification (say dirty page > > flushing), it plays readonly role. Locking source file exclusively through > > out the full reflink copy is unreasonable. > > > > This patch downgrades exclusive locks on source file to shared modes after > > page cache flushing and before cloning the extents. To avoid source file > > change after lock downgradation, direct write paths take IOLOCK_EXCL on > > seeing reflink copy happening to the files. ..... > I /do/ wonder if range locking would be a better solution here, since we > can safely unlock file ranges that we've already remapped? Depends. The prototype I did allowed concurrent remaps to run on different ranges of the file. The extent manipulations were still internally serialised by the ILOCK so the concurrent modifications were still serialised. Hence things like block mapping lookups for read IO still serialised. (And hence my interest in lockless btrees for the extent list so read IO wouldn't need to take the ILOCK at all.) However, if you want to remap the entire file, we've still got to start with locking the entire file range and draining IO and writing back all dirty data. So cloning a file is still a complete serialisation event with range locking, but we can optimise away some of the tail latencies by unlocking ranges remapped range by remapped range... Cheers, Dave.
Thank you Dave and Darrick, I will think more on this. Thanks, Wengang > On Jul 5, 2022, at 6:35 PM, Dave Chinner <david@fromorbit.com> wrote: > > On Tue, Jul 05, 2022 at 11:37:54AM -0700, Darrick J. Wong wrote: >> On Tue, Jun 28, 2022 at 11:07:55PM -0700, Wengang Wang wrote: >>> During a reflink operation, the IOLOCK and MMAPLOCK of the source file >>> are held in exclusive mode for the duration. This prevents reads on the >>> source file, which could be a very long time if the source file has >>> millions of extents. >>> >>> As the source of copy, besides some necessary modification (say dirty page >>> flushing), it plays readonly role. Locking source file exclusively through >>> out the full reflink copy is unreasonable. >>> >>> This patch downgrades exclusive locks on source file to shared modes after >>> page cache flushing and before cloning the extents. To avoid source file >>> change after lock downgradation, direct write paths take IOLOCK_EXCL on >>> seeing reflink copy happening to the files. > ..... > >> I /do/ wonder if range locking would be a better solution here, since we >> can safely unlock file ranges that we've already remapped? > > Depends. The prototype I did allowed concurrent remaps to run on > different ranges of the file. The extent manipulations were still > internally serialised by the ILOCK so the concurrent modifications > were still serialised. Hence things like block mapping lookups for > read IO still serialised. (And hence my interest in lockless btrees > for the extent list so read IO wouldn't need to take the ILOCK at > all.) > > However, if you want to remap the entire file, we've still got to > start with locking the entire file range and draining IO and writing > back all dirty data. So cloning a file is still a complete > serialisation event with range locking, but we can optimise away > some of the tail latencies by unlocking ranges remapped range by > remapped range... > > Cheers, > > Dave. > -- > Dave Chinner > david@fromorbit.com
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5a171c0b244b..6ca7118ee274 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -514,8 +514,10 @@ xfs_file_dio_write_aligned( struct iov_iter *from) { unsigned int iolock = XFS_IOLOCK_SHARED; + int remapping; ssize_t ret; +relock: ret = xfs_ilock_iocb(iocb, iolock); if (ret) return ret; @@ -523,14 +525,25 @@ xfs_file_dio_write_aligned( if (ret) goto out_unlock; + remapping = xfs_iflags_test(ip, XFS_IREMAPPING); + /* * We don't need to hold the IOLOCK exclusively across the IO, so demote * the iolock back to shared if we had to take the exclusive lock in * xfs_file_write_checks() for other reasons. + * But take IOLOCK_EXCL when reflink copy is going on */ if (iolock == XFS_IOLOCK_EXCL) { - xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); - iolock = XFS_IOLOCK_SHARED; + if (!remapping) { + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; + } + } else { /* iolock == XFS_ILOCK_SHARED */ + if (remapping) { + xfs_iunlock(ip, iolock); + iolock = XFS_IOLOCK_EXCL; + goto relock; + } } trace_xfs_file_direct_write(iocb, from); ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, @@ -1125,6 +1138,19 @@ xfs_file_remap_range( if (ret || len == 0) return ret; + /* + * Set XFS_IREMAPPING flag to source file before we downgrade + * the locks, so that all direct writes know they have to take + * IOLOCK_EXCL. + */ + xfs_iflags_set(src, XFS_IREMAPPING); + + /* + * From now on, we read only from src, so downgrade locks to allow + * read operations go. + */ + xfs_ilock_io_mmap_downgrade_src(src, dest); + trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len, @@ -1152,7 +1178,8 @@ xfs_file_remap_range( if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out)) xfs_log_force_inode(dest); out_unlock: - xfs_iunlock2_io_mmap(src, dest); + xfs_iflags_clear(src, XFS_IREMAPPING); + xfs_iunlock2_io_mmap_src_shared(src, dest); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); return remapped > 0 ? remapped : ret; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 52d6f2c7d58b..1cbd4a594f28 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3786,6 +3786,16 @@ xfs_ilock2_io_mmap( return 0; } +/* Downgrade the locks on src file if src and dest are not the same one. */ +void +xfs_ilock_io_mmap_downgrade_src( + struct xfs_inode *src, + struct xfs_inode *dest) +{ + if (src != dest) + xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); +} + /* Unlock both inodes to allow IO and mmap activity. */ void xfs_iunlock2_io_mmap( @@ -3798,3 +3808,24 @@ xfs_iunlock2_io_mmap( if (ip1 != ip2) inode_unlock(VFS_I(ip1)); } + +/* + * Unlock the exclusive locks on dest file. + * Also unlock the shared locks on src if src and dest are not the same one + */ +void +xfs_iunlock2_io_mmap_src_shared( + struct xfs_inode *src, + struct xfs_inode *dest) +{ + struct inode *src_inode = VFS_I(src); + struct inode *dest_inode = VFS_I(dest); + + inode_unlock(dest_inode); + filemap_invalidate_unlock(dest_inode->i_mapping); + if (src == dest) + return; + + inode_unlock_shared(src_inode); + filemap_invalidate_unlock_shared(src_inode->i_mapping); +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 7be6f8e705ab..c07d4b42cf9d 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -262,6 +262,13 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) */ #define XFS_INACTIVATING (1 << 13) +/* + * A flag indicating reflink copy / remapping is happening to the file as + * source. When set, all direct IOs should take IOLOCK_EXCL to avoid + * interphering the remapping. + */ +#define XFS_IREMAPPING (1 << 14) + /* All inode state flags related to inode reclaim. */ #define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \ XFS_IRECLAIM | \ @@ -512,5 +519,9 @@ void xfs_end_io(struct work_struct *work); int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); +void xfs_ilock_io_mmap_downgrade_src(struct xfs_inode *src, + struct xfs_inode *dest); +void xfs_iunlock2_io_mmap_src_shared(struct xfs_inode *src, + struct xfs_inode *dest); #endif /* __XFS_INODE_H__ */
During a reflink operation, the IOLOCK and MMAPLOCK of the source file are held in exclusive mode for the duration. This prevents reads on the source file, which could be a very long time if the source file has millions of extents. As the source of copy, besides some necessary modification (say dirty page flushing), it plays readonly role. Locking source file exclusively through out the full reflink copy is unreasonable. This patch downgrades exclusive locks on source file to shared modes after page cache flushing and before cloning the extents. To avoid source file change after lock downgradation, direct write paths take IOLOCK_EXCL on seeing reflink copy happening to the files. Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com> --- V2 changes: Commit message Make direct write paths take IOLOCK_EXCL when reflink copy is happening Tiny changes --- fs/xfs/xfs_file.c | 33 ++++++++++++++++++++++++++++++--- fs/xfs/xfs_inode.c | 31 +++++++++++++++++++++++++++++++ fs/xfs/xfs_inode.h | 11 +++++++++++ 3 files changed, 72 insertions(+), 3 deletions(-)