diff mbox

[6/6] xfs: avoid COW fork extent lookups in writeback if the fork didn't change

Message ID 20180717232405.18511-7-hch@lst.de (mailing list archive)
State Accepted
Headers show

Commit Message

hch@lst.de July 17, 2018, 11:24 p.m. UTC
Use the per-fork sequence counter to avoid lookups in the writeback code
unless the COW fork actually changed.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c  | 30 +++++++++++++++++++++++++-----
 fs/xfs/xfs_iomap.c |  5 ++++-
 fs/xfs/xfs_iomap.h |  2 +-
 3 files changed, 30 insertions(+), 7 deletions(-)

Comments

Carlos Maiolino July 18, 2018, 2:51 p.m. UTC | #1
On Tue, Jul 17, 2018 at 04:24:05PM -0700, Christoph Hellwig wrote:
> Used the per-fork sequence counter to avoid lookups in the writeback code
> unless the COW fork actually changed.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_aops.c  | 30 +++++++++++++++++++++++++-----
>  fs/xfs/xfs_iomap.c |  5 ++++-
>  fs/xfs/xfs_iomap.h |  2 +-
>  3 files changed, 30 insertions(+), 7 deletions(-)
> 

Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>

> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 814100d27343..aff9d44fa338 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -29,6 +29,7 @@
>  struct xfs_writepage_ctx {
>  	struct xfs_bmbt_irec    imap;
>  	unsigned int		io_type;
> +	unsigned int		cow_seq;
>  	struct xfs_ioend	*ioend;
>  };
>  
> @@ -310,6 +311,7 @@ xfs_map_blocks(
>  	struct xfs_mount	*mp = ip->i_mount;
>  	ssize_t			count = i_blocksize(inode);
>  	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
> +	xfs_fileoff_t		cow_fsb = NULLFILEOFF;
>  	struct xfs_bmbt_irec	imap;
>  	int			whichfork = XFS_DATA_FORK;
>  	struct xfs_iext_cursor	icur;
> @@ -333,12 +335,15 @@ xfs_map_blocks(
>  	 * COW fork blocks can overlap data fork blocks even if the blocks
>  	 * aren't shared.  COW I/O always takes precedent, so we must always
>  	 * check for overlap on reflink inodes unless the mapping is already a
> -	 * COW one.
> +	 * COW one, or the COW fork hasn't changed from the last time we looked
> +	 * at it.
>  	 */
>  	imap_valid = offset_fsb >= wpc->imap.br_startoff &&
>  		     offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
>  	if (imap_valid &&
> -	    (!xfs_inode_has_cow_data(ip) || wpc->io_type == XFS_IO_COW))
> +	    (!xfs_inode_has_cow_data(ip) ||
> +	     wpc->io_type == XFS_IO_COW ||
> +	     wpc->cow_seq == ip->i_cowfp->if_seq))
>  		return 0;
>  
>  	if (XFS_FORCED_SHUTDOWN(mp))
> @@ -364,8 +369,10 @@ xfs_map_blocks(
>  	 * it directly instead of looking up anything in the data fork.
>  	 */
>  	if (xfs_inode_has_cow_data(ip) &&
> -	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap) &&
> -	    imap.br_startoff <= offset_fsb) {
> +	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
> +		cow_fsb = imap.br_startoff;
> +	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
> +		wpc->cow_seq = ip->i_cowfp->if_seq;
>  		xfs_iunlock(ip, XFS_ILOCK_SHARED);
>  		/*
>  		 * Truncate can race with writeback since writeback doesn't
> @@ -411,6 +418,16 @@ xfs_map_blocks(
>  		imap.br_startblock = HOLESTARTBLOCK;
>  		wpc->io_type = XFS_IO_HOLE;
>  	} else {
> +		/*
> +		 * Truncate to the next COW extent if there is one.  This is the
> +		 * only opportunity to do this because we can skip COW fork
> +		 * lookups for the subsequent blocks in the mapping; however,
> +		 * the requirement to treat the COW range separately remains.
> +		 */
> +		if (cow_fsb != NULLFILEOFF &&
> +		    cow_fsb < imap.br_startoff + imap.br_blockcount)
> +			imap.br_blockcount = cow_fsb - imap.br_startoff;
> +
>  		if (isnullstartblock(imap.br_startblock)) {
>  			/* got a delalloc extent */
>  			wpc->io_type = XFS_IO_DELALLOC;
> @@ -427,9 +444,12 @@ xfs_map_blocks(
>  	trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
>  	return 0;
>  allocate_blocks:
> -	error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap);
> +	error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap,
> +			&wpc->cow_seq);
>  	if (error)
>  		return error;
> +	ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF ||
> +	       imap.br_startoff + imap.br_blockcount <= cow_fsb);
>  	wpc->imap = imap;
>  	trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
>  	return 0;
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 756694219f77..43a7ff7f450c 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -658,7 +658,8 @@ xfs_iomap_write_allocate(
>  	xfs_inode_t	*ip,
>  	int		whichfork,
>  	xfs_off_t	offset,
> -	xfs_bmbt_irec_t *imap)
> +	xfs_bmbt_irec_t *imap,
> +	unsigned int	*cow_seq)
>  {
>  	xfs_mount_t	*mp = ip->i_mount;
>  	xfs_fileoff_t	offset_fsb, last_block;
> @@ -780,6 +781,8 @@ xfs_iomap_write_allocate(
>  			if (error)
>  				goto error0;
>  
> +			if (whichfork == XFS_COW_FORK)
> +				*cow_seq = XFS_IFORK_PTR(ip, whichfork)->if_seq;
>  			xfs_iunlock(ip, XFS_ILOCK_EXCL);
>  		}
>  
> diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
> index 83474c9cede9..c6170548831b 100644
> --- a/fs/xfs/xfs_iomap.h
> +++ b/fs/xfs/xfs_iomap.h
> @@ -14,7 +14,7 @@ struct xfs_bmbt_irec;
>  int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
>  			struct xfs_bmbt_irec *, int);
>  int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
> -			struct xfs_bmbt_irec *);
> +			struct xfs_bmbt_irec *, unsigned int *);
>  int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
>  
>  void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
> -- 
> 2.18.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner July 21, 2018, 11:23 p.m. UTC | #2
On Tue, Jul 17, 2018 at 04:24:05PM -0700, Christoph Hellwig wrote:
> @@ -333,12 +335,15 @@ xfs_map_blocks(
>  	 * COW fork blocks can overlap data fork blocks even if the blocks
>  	 * aren't shared.  COW I/O always takes precedent, so we must always
>  	 * check for overlap on reflink inodes unless the mapping is already a
> -	 * COW one.
> +	 * COW one, or the COW fork hasn't changed from the last time we looked
> +	 * at it.
>  	 */
>  	imap_valid = offset_fsb >= wpc->imap.br_startoff &&
>  		     offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
>  	if (imap_valid &&
> -	    (!xfs_inode_has_cow_data(ip) || wpc->io_type == XFS_IO_COW))
> +	    (!xfs_inode_has_cow_data(ip) ||
> +	     wpc->io_type == XFS_IO_COW ||
> +	     wpc->cow_seq == ip->i_cowfp->if_seq))
>  		return 0;

This seqno check is still racy against concurrent extent tree
modification. We aren't holding the ILOCK here at all, the sequence
numbers aren't atomic and there are no memory barriers to serialise
cross-cpu load/store operations...

Cheers,

Dave.
hch@lst.de July 23, 2018, 7:49 a.m. UTC | #3
On Sun, Jul 22, 2018 at 09:23:11AM +1000, Dave Chinner wrote:
> On Tue, Jul 17, 2018 at 04:24:05PM -0700, Christoph Hellwig wrote:
> > @@ -333,12 +335,15 @@ xfs_map_blocks(
> >  	 * COW fork blocks can overlap data fork blocks even if the blocks
> >  	 * aren't shared.  COW I/O always takes precedent, so we must always
> >  	 * check for overlap on reflink inodes unless the mapping is already a
> > -	 * COW one.
> > +	 * COW one, or the COW fork hasn't changed from the last time we looked
> > +	 * at it.
> >  	 */
> >  	imap_valid = offset_fsb >= wpc->imap.br_startoff &&
> >  		     offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
> >  	if (imap_valid &&
> > -	    (!xfs_inode_has_cow_data(ip) || wpc->io_type == XFS_IO_COW))
> > +	    (!xfs_inode_has_cow_data(ip) ||
> > +	     wpc->io_type == XFS_IO_COW ||
> > +	     wpc->cow_seq == ip->i_cowfp->if_seq))
> >  		return 0;
> 
> This seqno check is still racy against concurrent extent tree
> modification. We aren't holding the ILOCK here at all, the sequence
> numbers aren't atomic and there are no memory barriers to serialise
> cross-cpu load/store operations...

mutex_unlock contains a barrier, and the sequence is a single register
read.  There is nothing that holding a lock here would help us with.
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong July 24, 2018, 10:35 p.m. UTC | #4
On Mon, Jul 23, 2018 at 09:49:41AM +0200, Christoph Hellwig wrote:
> On Sun, Jul 22, 2018 at 09:23:11AM +1000, Dave Chinner wrote:
> > On Tue, Jul 17, 2018 at 04:24:05PM -0700, Christoph Hellwig wrote:
> > > @@ -333,12 +335,15 @@ xfs_map_blocks(
> > >  	 * COW fork blocks can overlap data fork blocks even if the blocks
> > >  	 * aren't shared.  COW I/O always takes precedent, so we must always
> > >  	 * check for overlap on reflink inodes unless the mapping is already a
> > > -	 * COW one.
> > > +	 * COW one, or the COW fork hasn't changed from the last time we looked
> > > +	 * at it.
> > >  	 */
> > >  	imap_valid = offset_fsb >= wpc->imap.br_startoff &&
> > >  		     offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
> > >  	if (imap_valid &&
> > > -	    (!xfs_inode_has_cow_data(ip) || wpc->io_type == XFS_IO_COW))
> > > +	    (!xfs_inode_has_cow_data(ip) ||
> > > +	     wpc->io_type == XFS_IO_COW ||
> > > +	     wpc->cow_seq == ip->i_cowfp->if_seq))
> > >  		return 0;
> > 
> > This seqno check is still racy against concurrent extent tree
> > modification. We aren't holding the ILOCK here at all, the sequence
> > numbers aren't atomic and there are no memory barriers to serialise
> > cross-cpu load/store operations...
> 
> mutex_unlock contains a barrier, and the sequence is a single register
> read.  There is nothing holding a lock here would help us with.

Which mutex_unlock is that?  I took a second look at the i_cowfp access
and realized that we don't take ILOCK_SHARED until after the comparison.
Writeback doesn't take any of the other inode locks AFAIK... so either
there's some locking subtlety here that ought to be explained in a
comment, or is this a bug waiting to happen?

--D

> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
hch@lst.de July 27, 2018, 3:10 p.m. UTC | #5
On Tue, Jul 24, 2018 at 03:35:23PM -0700, Darrick J. Wong wrote:
> > mutex_unlock contains a barrier, and the sequence is a single register
> > read.  There is nothing holding a lock here would help us with.
> 
> Which mutex_unlock is that?

Actually an up_write (on i_lock), but the result is the same.

> I took a second look at the i_cowfp access
> and realized that we don't take ILOCK_SHARED until after the comparison.
> Writeback doesn't take any of the other inode locks AFAIK... so either
> there's some locking subtlety here that ought to be explained in a
> comment, or is this a bug waiting to happen?

What would you want the lock to protect?  As said above the if_seq
field itself doesn't need a lock, it can be read atomically because it
is a register or less.

And for the actual imap we don't have a lock anywhere in the writeback
code - as soon as we did the lookup or allocation we drop the i_lock
and keep using it.  That is generally fine because we only ever remove
extents from it after previously waiting for writeback and using
the iolock to prevent new writeback from being started.  The only new
bit with reflink support is that a valid data fork block mapping might
now be shadowed by a new reflink mapping, and that is exactly what the
sequence count protects against (instead of doing a lookup in the
cowfp and then instantly dropping the lock before).
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner Aug. 6, 2018, 2:37 a.m. UTC | #6
On Fri, Jul 27, 2018 at 05:10:39PM +0200, Christoph Hellwig wrote:
> On Tue, Jul 24, 2018 at 03:35:23PM -0700, Darrick J. Wong wrote:
> > > mutex_unlock contains a barrier, and the sequence is a single register
> > > read.  There is nothing holding a lock here would help us with.
> > 
> > Which mutex_unlock is that?
> 
> Actually an up_write (on i_lock), but the result is the same.

I'm not sure it is - I've lost count of the number of times I've
heard the phrase "unlock does not provide a memory barrier, only
unlock to lock provides a full memory barrier".  As such, I've
always treated unlock as a store (release) barrier to keep stores
inside the critical section, and lock as the corresponding load
barrier to keep loads inside the critical section.

I think that's irrelevant, anyway, because there is no memory
barrier implied until the ILOCK is dropped. i.e. we can change the
value before making the extent modification, but there's no
guarantee that it's visible to other CPUs until the store memory
barrier occurs when the ILOCK is dropped. i.e. after the extent map
has actually been changed.

And, FWIW, without read-side memory barriers before checking the
value, the load can be re-ordered and perhaps even elided by the
optimising compiler.

IOWs, I can't see any reliable access serialisation being provided
by holding the ILOCK while modifying the value. Perhaps this should
use WRITE_ONCE/READ_ONCE to ensure the correct memory barriers are
used to serialise the sequence count against itself rather than the
wider inode extent modification process?

Cheers,

Dave.
hch@lst.de Aug. 6, 2018, 4:45 p.m. UTC | #7
On Mon, Aug 06, 2018 at 12:37:38PM +1000, Dave Chinner wrote:
> IOWs, I can't see any reliable access serialisation being provided
> by holding the ILOCK while modifying the value. Perhaps this should
> use WRITE_ONCE/READ_ONCE to ensure the correct memory barriers are
> used to serialise tthe sequence count against itself rather than the
> wider inode extent modification process?

Yes, I think we need a barrier before the later actual modification
of the extent tree and thus should use WRITE_ONCE/READ_ONCE.

I've got a patch that is undergoing testing at the moment.
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 814100d27343..aff9d44fa338 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -29,6 +29,7 @@ 
 struct xfs_writepage_ctx {
 	struct xfs_bmbt_irec    imap;
 	unsigned int		io_type;
+	unsigned int		cow_seq;
 	struct xfs_ioend	*ioend;
 };
 
@@ -310,6 +311,7 @@  xfs_map_blocks(
 	struct xfs_mount	*mp = ip->i_mount;
 	ssize_t			count = i_blocksize(inode);
 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
+	xfs_fileoff_t		cow_fsb = NULLFILEOFF;
 	struct xfs_bmbt_irec	imap;
 	int			whichfork = XFS_DATA_FORK;
 	struct xfs_iext_cursor	icur;
@@ -333,12 +335,15 @@  xfs_map_blocks(
 	 * COW fork blocks can overlap data fork blocks even if the blocks
 	 * aren't shared.  COW I/O always takes precedent, so we must always
 	 * check for overlap on reflink inodes unless the mapping is already a
-	 * COW one.
+	 * COW one, or the COW fork hasn't changed from the last time we looked
+	 * at it.
 	 */
 	imap_valid = offset_fsb >= wpc->imap.br_startoff &&
 		     offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
 	if (imap_valid &&
-	    (!xfs_inode_has_cow_data(ip) || wpc->io_type == XFS_IO_COW))
+	    (!xfs_inode_has_cow_data(ip) ||
+	     wpc->io_type == XFS_IO_COW ||
+	     wpc->cow_seq == ip->i_cowfp->if_seq))
 		return 0;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
@@ -364,8 +369,10 @@  xfs_map_blocks(
 	 * it directly instead of looking up anything in the data fork.
 	 */
 	if (xfs_inode_has_cow_data(ip) &&
-	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap) &&
-	    imap.br_startoff <= offset_fsb) {
+	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
+		cow_fsb = imap.br_startoff;
+	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
+		wpc->cow_seq = ip->i_cowfp->if_seq;
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 		/*
 		 * Truncate can race with writeback since writeback doesn't
@@ -411,6 +418,16 @@  xfs_map_blocks(
 		imap.br_startblock = HOLESTARTBLOCK;
 		wpc->io_type = XFS_IO_HOLE;
 	} else {
+		/*
+		 * Truncate to the next COW extent if there is one.  This is the
+		 * only opportunity to do this because we can skip COW fork
+		 * lookups for the subsequent blocks in the mapping; however,
+		 * the requirement to treat the COW range separately remains.
+		 */
+		if (cow_fsb != NULLFILEOFF &&
+		    cow_fsb < imap.br_startoff + imap.br_blockcount)
+			imap.br_blockcount = cow_fsb - imap.br_startoff;
+
 		if (isnullstartblock(imap.br_startblock)) {
 			/* got a delalloc extent */
 			wpc->io_type = XFS_IO_DELALLOC;
@@ -427,9 +444,12 @@  xfs_map_blocks(
 	trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
 	return 0;
 allocate_blocks:
-	error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap);
+	error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap,
+			&wpc->cow_seq);
 	if (error)
 		return error;
+	ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF ||
+	       imap.br_startoff + imap.br_blockcount <= cow_fsb);
 	wpc->imap = imap;
 	trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
 	return 0;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 756694219f77..43a7ff7f450c 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -658,7 +658,8 @@  xfs_iomap_write_allocate(
 	xfs_inode_t	*ip,
 	int		whichfork,
 	xfs_off_t	offset,
-	xfs_bmbt_irec_t *imap)
+	xfs_bmbt_irec_t *imap,
+	unsigned int	*cow_seq)
 {
 	xfs_mount_t	*mp = ip->i_mount;
 	xfs_fileoff_t	offset_fsb, last_block;
@@ -780,6 +781,8 @@  xfs_iomap_write_allocate(
 			if (error)
 				goto error0;
 
+			if (whichfork == XFS_COW_FORK)
+				*cow_seq = XFS_IFORK_PTR(ip, whichfork)->if_seq;
 			xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		}
 
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 83474c9cede9..c6170548831b 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -14,7 +14,7 @@  struct xfs_bmbt_irec;
 int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
 			struct xfs_bmbt_irec *, int);
 int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
-			struct xfs_bmbt_irec *);
+			struct xfs_bmbt_irec *, unsigned int *);
 int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
 
 void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,