Message ID | 20180717232405.18511-7-hch@lst.de (mailing list archive) |
---|---|
State | Accepted |
Headers | show |
On Tue, Jul 17, 2018 at 04:24:05PM -0700, Christoph Hellwig wrote: > Used the per-fork sequence counter to avoid lookups in the writeback code > unless the COW fork actually changed. > > Signed-off-by: Christoph Hellwig <hch@lst.de> > --- > fs/xfs/xfs_aops.c | 30 +++++++++++++++++++++++++----- > fs/xfs/xfs_iomap.c | 5 ++++- > fs/xfs/xfs_iomap.h | 2 +- > 3 files changed, 30 insertions(+), 7 deletions(-) > Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com> > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c > index 814100d27343..aff9d44fa338 100644 > --- a/fs/xfs/xfs_aops.c > +++ b/fs/xfs/xfs_aops.c > @@ -29,6 +29,7 @@ > struct xfs_writepage_ctx { > struct xfs_bmbt_irec imap; > unsigned int io_type; > + unsigned int cow_seq; > struct xfs_ioend *ioend; > }; > > @@ -310,6 +311,7 @@ xfs_map_blocks( > struct xfs_mount *mp = ip->i_mount; > ssize_t count = i_blocksize(inode); > xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb; > + xfs_fileoff_t cow_fsb = NULLFILEOFF; > struct xfs_bmbt_irec imap; > int whichfork = XFS_DATA_FORK; > struct xfs_iext_cursor icur; > @@ -333,12 +335,15 @@ xfs_map_blocks( > * COW fork blocks can overlap data fork blocks even if the blocks > * aren't shared. COW I/O always takes precedent, so we must always > * check for overlap on reflink inodes unless the mapping is already a > - * COW one. > + * COW one, or the COW fork hasn't changed from the last time we looked > + * at it. > */ > imap_valid = offset_fsb >= wpc->imap.br_startoff && > offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount; > if (imap_valid && > - (!xfs_inode_has_cow_data(ip) || wpc->io_type == XFS_IO_COW)) > + (!xfs_inode_has_cow_data(ip) || > + wpc->io_type == XFS_IO_COW || > + wpc->cow_seq == ip->i_cowfp->if_seq)) > return 0; > > if (XFS_FORCED_SHUTDOWN(mp)) > @@ -364,8 +369,10 @@ xfs_map_blocks( > * it directly instead of looking up anything in the data fork. 
> */ > if (xfs_inode_has_cow_data(ip) && > - xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap) && > - imap.br_startoff <= offset_fsb) { > + xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap)) > + cow_fsb = imap.br_startoff; > + if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { > + wpc->cow_seq = ip->i_cowfp->if_seq; > xfs_iunlock(ip, XFS_ILOCK_SHARED); > /* > * Truncate can race with writeback since writeback doesn't > @@ -411,6 +418,16 @@ xfs_map_blocks( > imap.br_startblock = HOLESTARTBLOCK; > wpc->io_type = XFS_IO_HOLE; > } else { > + /* > + * Truncate to the next COW extent if there is one. This is the > + * only opportunity to do this because we can skip COW fork > + * lookups for the subsequent blocks in the mapping; however, > + * the requirement to treat the COW range separately remains. > + */ > + if (cow_fsb != NULLFILEOFF && > + cow_fsb < imap.br_startoff + imap.br_blockcount) > + imap.br_blockcount = cow_fsb - imap.br_startoff; > + > if (isnullstartblock(imap.br_startblock)) { > /* got a delalloc extent */ > wpc->io_type = XFS_IO_DELALLOC; > @@ -427,9 +444,12 @@ xfs_map_blocks( > trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap); > return 0; > allocate_blocks: > - error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap); > + error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap, > + &wpc->cow_seq); > if (error) > return error; > + ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF || > + imap.br_startoff + imap.br_blockcount <= cow_fsb); > wpc->imap = imap; > trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap); > return 0; > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c > index 756694219f77..43a7ff7f450c 100644 > --- a/fs/xfs/xfs_iomap.c > +++ b/fs/xfs/xfs_iomap.c > @@ -658,7 +658,8 @@ xfs_iomap_write_allocate( > xfs_inode_t *ip, > int whichfork, > xfs_off_t offset, > - xfs_bmbt_irec_t *imap) > + xfs_bmbt_irec_t *imap, > + unsigned int *cow_seq) > { > 
xfs_mount_t *mp = ip->i_mount; > xfs_fileoff_t offset_fsb, last_block; > @@ -780,6 +781,8 @@ xfs_iomap_write_allocate( > if (error) > goto error0; > > + if (whichfork == XFS_COW_FORK) > + *cow_seq = XFS_IFORK_PTR(ip, whichfork)->if_seq; > xfs_iunlock(ip, XFS_ILOCK_EXCL); > } > > diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h > index 83474c9cede9..c6170548831b 100644 > --- a/fs/xfs/xfs_iomap.h > +++ b/fs/xfs/xfs_iomap.h > @@ -14,7 +14,7 @@ struct xfs_bmbt_irec; > int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, > struct xfs_bmbt_irec *, int); > int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t, > - struct xfs_bmbt_irec *); > + struct xfs_bmbt_irec *, unsigned int *); > int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); > > void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, > -- > 2.18.0 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jul 17, 2018 at 04:24:05PM -0700, Christoph Hellwig wrote: > @@ -333,12 +335,15 @@ xfs_map_blocks( > * COW fork blocks can overlap data fork blocks even if the blocks > * aren't shared. COW I/O always takes precedent, so we must always > * check for overlap on reflink inodes unless the mapping is already a > - * COW one. > + * COW one, or the COW fork hasn't changed from the last time we looked > + * at it. > */ > imap_valid = offset_fsb >= wpc->imap.br_startoff && > offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount; > if (imap_valid && > - (!xfs_inode_has_cow_data(ip) || wpc->io_type == XFS_IO_COW)) > + (!xfs_inode_has_cow_data(ip) || > + wpc->io_type == XFS_IO_COW || > + wpc->cow_seq == ip->i_cowfp->if_seq)) > return 0; This seqno check is still racy against concurrent extent tree modification. We aren't holding the ILOCK here at all, the sequence numbers aren't atomic and there are no memory barriers to serialise cross-cpu load/store operations... Cheers, Dave.
On Sun, Jul 22, 2018 at 09:23:11AM +1000, Dave Chinner wrote: > On Tue, Jul 17, 2018 at 04:24:05PM -0700, Christoph Hellwig wrote: > > @@ -333,12 +335,15 @@ xfs_map_blocks( > > * COW fork blocks can overlap data fork blocks even if the blocks > > * aren't shared. COW I/O always takes precedent, so we must always > > * check for overlap on reflink inodes unless the mapping is already a > > - * COW one. > > + * COW one, or the COW fork hasn't changed from the last time we looked > > + * at it. > > */ > > imap_valid = offset_fsb >= wpc->imap.br_startoff && > > offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount; > > if (imap_valid && > > - (!xfs_inode_has_cow_data(ip) || wpc->io_type == XFS_IO_COW)) > > + (!xfs_inode_has_cow_data(ip) || > > + wpc->io_type == XFS_IO_COW || > > + wpc->cow_seq == ip->i_cowfp->if_seq)) > > return 0; > > This seqno check is still racy against concurrent extent tree > modification. We aren't holding the ILOCK here at all, the sequence > numbers aren't atomic and there are no memory barriers to serialise > cross-cpu load/store operations... mutex_unlock contains a barrier, and the sequence is a single register read. There is nothing holding a lock here would help us with. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jul 23, 2018 at 09:49:41AM +0200, Christoph Hellwig wrote: > On Sun, Jul 22, 2018 at 09:23:11AM +1000, Dave Chinner wrote: > > On Tue, Jul 17, 2018 at 04:24:05PM -0700, Christoph Hellwig wrote: > > > @@ -333,12 +335,15 @@ xfs_map_blocks( > > > * COW fork blocks can overlap data fork blocks even if the blocks > > > * aren't shared. COW I/O always takes precedent, so we must always > > > * check for overlap on reflink inodes unless the mapping is already a > > > - * COW one. > > > + * COW one, or the COW fork hasn't changed from the last time we looked > > > + * at it. > > > */ > > > imap_valid = offset_fsb >= wpc->imap.br_startoff && > > > offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount; > > > if (imap_valid && > > > - (!xfs_inode_has_cow_data(ip) || wpc->io_type == XFS_IO_COW)) > > > + (!xfs_inode_has_cow_data(ip) || > > > + wpc->io_type == XFS_IO_COW || > > > + wpc->cow_seq == ip->i_cowfp->if_seq)) > > > return 0; > > > > This seqno check is still racy against concurrent extent tree > > modification. We aren't holding the ILOCK here at all, the sequence > > numbers aren't atomic and there are no memory barriers to serialise > > cross-cpu load/store operations... > > mutex_unlock contains a barrier, and the sequence is a single register > read. There is nothing holding a lock here would help us with. Which mutex_unlock is that? I took a second look at the i_cowfp access and realized that we don't take ILOCK_SHARED until after the comparison. Writeback doesn't take any of the other inode locks AFAIK... so either there's some locking subtlety here that ought to be explained in a comment, or is this a bug waiting to happen? 
--D > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jul 24, 2018 at 03:35:23PM -0700, Darrick J. Wong wrote: > > mutex_unlock contains a barrier, and the sequence is a single register > > read. There is nothing holding a lock here would help us with. > > Which mutex_unlock is that? Actually an up_write (on i_lock), but the result is the same. > I took a second look at the i_cowfp access > and realized that we don't take ILOCK_SHARED until after the comparison. > Writeback doesn't take any of the other inode locks AFAIK... so either > there's some locking subtlety here that ought to be explained in a > comment, or is this a bug waiting to happen? What would you want the lock to protect? As said above the if_seq field itself doesn't need a lock, it can be read atomically because it is a register or less. And for the actual imap we don't have a lock anywhere in the writeback code - as soon as we did the lookup or allocation we drop the i_lock and keep using it. That is generally fine because we only ever remove extents from it after previously waiting for writeback and using the iolock to prevent new writeback from being started. The only new bit with reflink support is that a valid data fork block mapping might now be shadowed by a new reflink mapping, and that is exactly what the sequence count protects against (instead of doing a lookup in the cowfp and then instantly dropping the lock before). -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jul 27, 2018 at 05:10:39PM +0200, Christoph Hellwig wrote: > On Tue, Jul 24, 2018 at 03:35:23PM -0700, Darrick J. Wong wrote: > > > mutex_unlock contains a barrier, and the sequence is a single register > > > read. There is nothing holding a lock here would help us with. > > > > Which mutex_unlock is that? > > Actually an up_write (on i_lock), but the result is the same. I'm not sure it is - I've lost count of the number of times I've heard the phrase "unlock does not provide a memory barrier, only unlock to lock provides a full memory barrier". As such, I've always treated unlock as a store (release) barrier to keep stores inside the critical section, and lock as the corresponding load barrier to keep loads inside the critical section. I think that's irrelevant, anyway, because there is no memory barrier implied until the ILOCK is dropped. i.e. we can change the value before making the extent modification, but there's no guarantee that it's visible to other CPUs until the store memory barrier occurs when the ILOCK is dropped. i.e. after the extent map has actually been changed. And, FWIW, without read-side memory barriers before checking the value, the load can be re-ordered and perhaps even elided by the optimising compiler. IOWs, I can't see any reliable access serialisation being provided by holding the ILOCK while modifying the value. Perhaps this should use WRITE_ONCE/READ_ONCE to ensure the correct memory barriers are used to serialise the sequence count against itself rather than the wider inode extent modification process? Cheers, Dave.
On Mon, Aug 06, 2018 at 12:37:38PM +1000, Dave Chinner wrote: > IOWs, I can't see any reliable access serialisation being provided > by holding the ILOCK while modifying the value. Perhaps this should > use WRITE_ONCE/READ_ONCE to ensure the correct memory barriers are > used to serialise the sequence count against itself rather than the > wider inode extent modification process? Yes, I think we need a barrier before the later actual modification of the extent tree and thus should use WRITE_ONCE/READ_ONCE. I've got a patch that is undergoing testing at the moment. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 814100d27343..aff9d44fa338 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -29,6 +29,7 @@ struct xfs_writepage_ctx { struct xfs_bmbt_irec imap; unsigned int io_type; + unsigned int cow_seq; struct xfs_ioend *ioend; }; @@ -310,6 +311,7 @@ xfs_map_blocks( struct xfs_mount *mp = ip->i_mount; ssize_t count = i_blocksize(inode); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb; + xfs_fileoff_t cow_fsb = NULLFILEOFF; struct xfs_bmbt_irec imap; int whichfork = XFS_DATA_FORK; struct xfs_iext_cursor icur; @@ -333,12 +335,15 @@ xfs_map_blocks( * COW fork blocks can overlap data fork blocks even if the blocks * aren't shared. COW I/O always takes precedent, so we must always * check for overlap on reflink inodes unless the mapping is already a - * COW one. + * COW one, or the COW fork hasn't changed from the last time we looked + * at it. */ imap_valid = offset_fsb >= wpc->imap.br_startoff && offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount; if (imap_valid && - (!xfs_inode_has_cow_data(ip) || wpc->io_type == XFS_IO_COW)) + (!xfs_inode_has_cow_data(ip) || + wpc->io_type == XFS_IO_COW || + wpc->cow_seq == ip->i_cowfp->if_seq)) return 0; if (XFS_FORCED_SHUTDOWN(mp)) @@ -364,8 +369,10 @@ xfs_map_blocks( * it directly instead of looking up anything in the data fork. */ if (xfs_inode_has_cow_data(ip) && - xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap) && - imap.br_startoff <= offset_fsb) { + xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap)) + cow_fsb = imap.br_startoff; + if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { + wpc->cow_seq = ip->i_cowfp->if_seq; xfs_iunlock(ip, XFS_ILOCK_SHARED); /* * Truncate can race with writeback since writeback doesn't @@ -411,6 +418,16 @@ xfs_map_blocks( imap.br_startblock = HOLESTARTBLOCK; wpc->io_type = XFS_IO_HOLE; } else { + /* + * Truncate to the next COW extent if there is one. 
This is the + * only opportunity to do this because we can skip COW fork + * lookups for the subsequent blocks in the mapping; however, + * the requirement to treat the COW range separately remains. + */ + if (cow_fsb != NULLFILEOFF && + cow_fsb < imap.br_startoff + imap.br_blockcount) + imap.br_blockcount = cow_fsb - imap.br_startoff; + if (isnullstartblock(imap.br_startblock)) { /* got a delalloc extent */ wpc->io_type = XFS_IO_DELALLOC; @@ -427,9 +444,12 @@ xfs_map_blocks( trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap); return 0; allocate_blocks: - error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap); + error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap, + &wpc->cow_seq); if (error) return error; + ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF || + imap.br_startoff + imap.br_blockcount <= cow_fsb); wpc->imap = imap; trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap); return 0; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 756694219f77..43a7ff7f450c 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -658,7 +658,8 @@ xfs_iomap_write_allocate( xfs_inode_t *ip, int whichfork, xfs_off_t offset, - xfs_bmbt_irec_t *imap) + xfs_bmbt_irec_t *imap, + unsigned int *cow_seq) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb, last_block; @@ -780,6 +781,8 @@ xfs_iomap_write_allocate( if (error) goto error0; + if (whichfork == XFS_COW_FORK) + *cow_seq = XFS_IFORK_PTR(ip, whichfork)->if_seq; xfs_iunlock(ip, XFS_ILOCK_EXCL); } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 83474c9cede9..c6170548831b 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -14,7 +14,7 @@ struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t, - struct xfs_bmbt_irec *); + struct xfs_bmbt_irec *, unsigned int *); int xfs_iomap_write_unwritten(struct xfs_inode 
*, xfs_off_t, xfs_off_t, bool); void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
Used the per-fork sequence counter to avoid lookups in the writeback code unless the COW fork actually changed. Signed-off-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_aops.c | 30 +++++++++++++++++++++++++----- fs/xfs/xfs_iomap.c | 5 ++++- fs/xfs/xfs_iomap.h | 2 +- 3 files changed, 30 insertions(+), 7 deletions(-)