[v3] xfs: fix COW writeback race
diff mbox

Message ID 1484837472-14368-1-git-send-email-hch@lst.de
State Accepted
Headers show

Commit Message

Christoph Hellwig Jan. 19, 2017, 2:51 p.m. UTC
Due to the way how xfs_iomap_write_allocate tries to convert the whole
found extents from delalloc to real space we can run into a race
condition with multiple threads doing writes to this same extent.
For the non-COW case that is harmless as the only thing that can happen
is that we call xfs_bmapi_write on an extent that has already been
converted to a real allocation.  For COW writes where we move the extent
from the COW to the data fork after I/O completion the race is, however,
not quite as harmless.  In the worst case we are now calling
xfs_bmapi_write on a region that contains hole in the COW work, which
will trip up an assert in debug builds or lead to file system corruption
in non-debug builds.  This seems to be reproducible with workloads of
small O_DSYNC write, although so far I've not managed to come up with
a with an isolated reproducer.

The fix for the issue is relatively simple:  tell xfs_bmapi_write
that we are only asked to convert delayed allocations and skip holes
in that case.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_bmap.c | 44 ++++++++++++++++++++++++++++++++------------
 fs/xfs/libxfs/xfs_bmap.h |  6 +++++-
 fs/xfs/xfs_iomap.c       |  2 +-
 3 files changed, 38 insertions(+), 14 deletions(-)

Comments

Brian Foster Jan. 20, 2017, 11:57 a.m. UTC | #1
On Thu, Jan 19, 2017 at 03:51:12PM +0100, Christoph Hellwig wrote:
> Due to the way how xfs_iomap_write_allocate tries to convert the whole
> found extents from delalloc to real space we can run into a race
> condition with multiple threads doing writes to this same extent.
> For the non-COW case that is harmless as the only thing that can happen
> is that we call xfs_bmapi_write on an extent that has already been
> converted to a real allocation.  For COW writes where we move the extent
> from the COW to the data fork after I/O completion the race is, however,
> not quite as harmless.  In the worst case we are now calling
> xfs_bmapi_write on a region that contains hole in the COW work, which
> will trip up an assert in debug builds or lead to file system corruption
> in non-debug builds.  This seems to be reproducible with workloads of
> small O_DSYNC write, although so far I've not managed to come up with
> a with an isolated reproducer.
> 
> The fix for the issue is relatively simple:  tell xfs_bmapi_write
> that we are only asked to convert delayed allocations and skip holes
> in that case.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/libxfs/xfs_bmap.c | 44 ++++++++++++++++++++++++++++++++------------
>  fs/xfs/libxfs/xfs_bmap.h |  6 +++++-
>  fs/xfs/xfs_iomap.c       |  2 +-
>  3 files changed, 38 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index 44773c9..ab82dd4 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -4514,8 +4514,6 @@ xfs_bmapi_write(
>  	int			n;		/* current extent index */
>  	xfs_fileoff_t		obno;		/* old block number (offset) */
>  	int			whichfork;	/* data or attr fork */
> -	char			inhole;		/* current location is hole in file */
> -	char			wasdelay;	/* old extent was delayed */
>  
>  #ifdef DEBUG
>  	xfs_fileoff_t		orig_bno;	/* original block number value */
> @@ -4603,22 +4601,44 @@ xfs_bmapi_write(
>  	bma.firstblock = firstblock;
>  
>  	while (bno < end && n < *nmap) {
> -		inhole = eof || bma.got.br_startoff > bno;
> -		wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
> +		bool			need_alloc = false, wasdelay = false;
>  
> -		/*
> -		 * Make sure we only reflink into a hole.
> -		 */
> -		if (flags & XFS_BMAPI_REMAP)
> -			ASSERT(inhole);
> -		if (flags & XFS_BMAPI_COWFORK)
> -			ASSERT(!inhole);
> +		/* in hole or beyoned EOF? */

Typo:			      beyond

Otherwise looks good:

Reviewed-by: Brian Foster <bfoster@redhat.com>

> +		if (eof || bma.got.br_startoff > bno) {
> +			if (flags & XFS_BMAPI_DELALLOC) {
> +				/*
> +				 * For the COW fork we can reasonably get a
> +				 * request for converting an extent that races
> +				 * with other threads already having converted
> +				 * part of it, as there converting COW to
> +				 * regular blocks is not protected using the
> +				 * IOLOCK.
> +				 */
> +				ASSERT(flags & XFS_BMAPI_COWFORK);
> +				if (!(flags & XFS_BMAPI_COWFORK)) {
> +					error = -EIO;
> +					goto error0;
> +				}
> +
> +				if (eof || bno >= end)
> +					break;
> +			} else {
> +				need_alloc = true;
> +			}
> +		} else {
> +			/*
> +			 * Make sure we only reflink into a hole.
> +			 */
> +			ASSERT(!(flags & XFS_BMAPI_REMAP));
> +			if (isnullstartblock(bma.got.br_startblock))
> +				wasdelay = true;
> +		}
>  
>  		/*
>  		 * First, deal with the hole before the allocated space
>  		 * that we found, if any.
>  		 */
> -		if (inhole || wasdelay) {
> +		if (need_alloc || wasdelay) {
>  			bma.eof = eof;
>  			bma.conv = !!(flags & XFS_BMAPI_CONVERT);
>  			bma.wasdel = wasdelay;
> diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
> index cecd094..cdef87d 100644
> --- a/fs/xfs/libxfs/xfs_bmap.h
> +++ b/fs/xfs/libxfs/xfs_bmap.h
> @@ -110,6 +110,9 @@ struct xfs_extent_free_item
>  /* Map something in the CoW fork. */
>  #define XFS_BMAPI_COWFORK	0x200
>  
> +/* Only convert delalloc space, don't allocate entirely new extents */
> +#define XFS_BMAPI_DELALLOC	0x400
> +
>  #define XFS_BMAPI_FLAGS \
>  	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
>  	{ XFS_BMAPI_METADATA,	"METADATA" }, \
> @@ -120,7 +123,8 @@ struct xfs_extent_free_item
>  	{ XFS_BMAPI_CONVERT,	"CONVERT" }, \
>  	{ XFS_BMAPI_ZERO,	"ZERO" }, \
>  	{ XFS_BMAPI_REMAP,	"REMAP" }, \
> -	{ XFS_BMAPI_COWFORK,	"COWFORK" }
> +	{ XFS_BMAPI_COWFORK,	"COWFORK" }, \
> +	{ XFS_BMAPI_DELALLOC,	"DELALLOC" }
>  
>  
>  static inline int xfs_bmapi_aflag(int w)
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index ca137b7..7ee8629 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -795,7 +795,7 @@ xfs_iomap_write_allocate(
>  	xfs_trans_t	*tp;
>  	int		nimaps;
>  	int		error = 0;
> -	int		flags = 0;
> +	int		flags = XFS_BMAPI_DELALLOC;
>  	int		nres;
>  
>  	if (whichfork == XFS_COW_FORK)
> -- 
> 2.1.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Jan. 20, 2017, 4:25 p.m. UTC | #2
On Thu, Jan 19, 2017 at 03:51:12PM +0100, Christoph Hellwig wrote:
> Due to the way how xfs_iomap_write_allocate tries to convert the whole
> found extents from delalloc to real space we can run into a race
> condition with multiple threads doing writes to this same extent.
> For the non-COW case that is harmless as the only thing that can happen
> is that we call xfs_bmapi_write on an extent that has already been
> converted to a real allocation.  For COW writes where we move the extent
> from the COW to the data fork after I/O completion the race is, however,
> not quite as harmless.  In the worst case we are now calling
> xfs_bmapi_write on a region that contains hole in the COW work, which
> will trip up an assert in debug builds or lead to file system corruption
> in non-debug builds.  This seems to be reproducible with workloads of
> small O_DSYNC write, although so far I've not managed to come up with
> a with an isolated reproducer.
> 
> The fix for the issue is relatively simple:  tell xfs_bmapi_write
> that we are only asked to convert delayed allocations and skip holes
> in that case.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

Applied, thanks.

--D

> ---
>  fs/xfs/libxfs/xfs_bmap.c | 44 ++++++++++++++++++++++++++++++++------------
>  fs/xfs/libxfs/xfs_bmap.h |  6 +++++-
>  fs/xfs/xfs_iomap.c       |  2 +-
>  3 files changed, 38 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index 44773c9..ab82dd4 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -4514,8 +4514,6 @@ xfs_bmapi_write(
>  	int			n;		/* current extent index */
>  	xfs_fileoff_t		obno;		/* old block number (offset) */
>  	int			whichfork;	/* data or attr fork */
> -	char			inhole;		/* current location is hole in file */
> -	char			wasdelay;	/* old extent was delayed */
>  
>  #ifdef DEBUG
>  	xfs_fileoff_t		orig_bno;	/* original block number value */
> @@ -4603,22 +4601,44 @@ xfs_bmapi_write(
>  	bma.firstblock = firstblock;
>  
>  	while (bno < end && n < *nmap) {
> -		inhole = eof || bma.got.br_startoff > bno;
> -		wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
> +		bool			need_alloc = false, wasdelay = false;
>  
> -		/*
> -		 * Make sure we only reflink into a hole.
> -		 */
> -		if (flags & XFS_BMAPI_REMAP)
> -			ASSERT(inhole);
> -		if (flags & XFS_BMAPI_COWFORK)
> -			ASSERT(!inhole);
> +		/* in hole or beyoned EOF? */
> +		if (eof || bma.got.br_startoff > bno) {
> +			if (flags & XFS_BMAPI_DELALLOC) {
> +				/*
> +				 * For the COW fork we can reasonably get a
> +				 * request for converting an extent that races
> +				 * with other threads already having converted
> +				 * part of it, as there converting COW to
> +				 * regular blocks is not protected using the
> +				 * IOLOCK.
> +				 */
> +				ASSERT(flags & XFS_BMAPI_COWFORK);
> +				if (!(flags & XFS_BMAPI_COWFORK)) {
> +					error = -EIO;
> +					goto error0;
> +				}
> +
> +				if (eof || bno >= end)
> +					break;
> +			} else {
> +				need_alloc = true;
> +			}
> +		} else {
> +			/*
> +			 * Make sure we only reflink into a hole.
> +			 */
> +			ASSERT(!(flags & XFS_BMAPI_REMAP));
> +			if (isnullstartblock(bma.got.br_startblock))
> +				wasdelay = true;
> +		}
>  
>  		/*
>  		 * First, deal with the hole before the allocated space
>  		 * that we found, if any.
>  		 */
> -		if (inhole || wasdelay) {
> +		if (need_alloc || wasdelay) {
>  			bma.eof = eof;
>  			bma.conv = !!(flags & XFS_BMAPI_CONVERT);
>  			bma.wasdel = wasdelay;
> diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
> index cecd094..cdef87d 100644
> --- a/fs/xfs/libxfs/xfs_bmap.h
> +++ b/fs/xfs/libxfs/xfs_bmap.h
> @@ -110,6 +110,9 @@ struct xfs_extent_free_item
>  /* Map something in the CoW fork. */
>  #define XFS_BMAPI_COWFORK	0x200
>  
> +/* Only convert delalloc space, don't allocate entirely new extents */
> +#define XFS_BMAPI_DELALLOC	0x400
> +
>  #define XFS_BMAPI_FLAGS \
>  	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
>  	{ XFS_BMAPI_METADATA,	"METADATA" }, \
> @@ -120,7 +123,8 @@ struct xfs_extent_free_item
>  	{ XFS_BMAPI_CONVERT,	"CONVERT" }, \
>  	{ XFS_BMAPI_ZERO,	"ZERO" }, \
>  	{ XFS_BMAPI_REMAP,	"REMAP" }, \
> -	{ XFS_BMAPI_COWFORK,	"COWFORK" }
> +	{ XFS_BMAPI_COWFORK,	"COWFORK" }, \
> +	{ XFS_BMAPI_DELALLOC,	"DELALLOC" }
>  
>  
>  static inline int xfs_bmapi_aflag(int w)
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index ca137b7..7ee8629 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -795,7 +795,7 @@ xfs_iomap_write_allocate(
>  	xfs_trans_t	*tp;
>  	int		nimaps;
>  	int		error = 0;
> -	int		flags = 0;
> +	int		flags = XFS_BMAPI_DELALLOC;
>  	int		nres;
>  
>  	if (whichfork == XFS_COW_FORK)
> -- 
> 2.1.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Jan. 23, 2017, 12:18 p.m. UTC | #3
On Fri, Jan 20, 2017 at 08:25:07AM -0800, Darrick J. Wong wrote:
> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Applied, thanks.

I haven't seen it in any tree yet.  I'd love to see it in -rc6
and include it in my next -stable backport.
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Jan. 23, 2017, 6:56 p.m. UTC | #4
On Mon, Jan 23, 2017 at 01:18:08PM +0100, Christoph Hellwig wrote:
> On Fri, Jan 20, 2017 at 08:25:07AM -0800, Darrick J. Wong wrote:
> > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Applied, thanks.
> 
> I haven't seen it in any tree yet.  I'd love to see it in -rc6
> and include it in my next -stable backport.

Yeah, it'll be in the rc fixes branch tonight.

--D
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch
diff mbox

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 44773c9..ab82dd4 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4514,8 +4514,6 @@  xfs_bmapi_write(
 	int			n;		/* current extent index */
 	xfs_fileoff_t		obno;		/* old block number (offset) */
 	int			whichfork;	/* data or attr fork */
-	char			inhole;		/* current location is hole in file */
-	char			wasdelay;	/* old extent was delayed */
 
 #ifdef DEBUG
 	xfs_fileoff_t		orig_bno;	/* original block number value */
@@ -4603,22 +4601,44 @@  xfs_bmapi_write(
 	bma.firstblock = firstblock;
 
 	while (bno < end && n < *nmap) {
-		inhole = eof || bma.got.br_startoff > bno;
-		wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
+		bool			need_alloc = false, wasdelay = false;
 
-		/*
-		 * Make sure we only reflink into a hole.
-		 */
-		if (flags & XFS_BMAPI_REMAP)
-			ASSERT(inhole);
-		if (flags & XFS_BMAPI_COWFORK)
-			ASSERT(!inhole);
+		/* in hole or beyoned EOF? */
+		if (eof || bma.got.br_startoff > bno) {
+			if (flags & XFS_BMAPI_DELALLOC) {
+				/*
+				 * For the COW fork we can reasonably get a
+				 * request for converting an extent that races
+				 * with other threads already having converted
+				 * part of it, as there converting COW to
+				 * regular blocks is not protected using the
+				 * IOLOCK.
+				 */
+				ASSERT(flags & XFS_BMAPI_COWFORK);
+				if (!(flags & XFS_BMAPI_COWFORK)) {
+					error = -EIO;
+					goto error0;
+				}
+
+				if (eof || bno >= end)
+					break;
+			} else {
+				need_alloc = true;
+			}
+		} else {
+			/*
+			 * Make sure we only reflink into a hole.
+			 */
+			ASSERT(!(flags & XFS_BMAPI_REMAP));
+			if (isnullstartblock(bma.got.br_startblock))
+				wasdelay = true;
+		}
 
 		/*
 		 * First, deal with the hole before the allocated space
 		 * that we found, if any.
 		 */
-		if (inhole || wasdelay) {
+		if (need_alloc || wasdelay) {
 			bma.eof = eof;
 			bma.conv = !!(flags & XFS_BMAPI_CONVERT);
 			bma.wasdel = wasdelay;
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index cecd094..cdef87d 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -110,6 +110,9 @@  struct xfs_extent_free_item
 /* Map something in the CoW fork. */
 #define XFS_BMAPI_COWFORK	0x200
 
+/* Only convert delalloc space, don't allocate entirely new extents */
+#define XFS_BMAPI_DELALLOC	0x400
+
 #define XFS_BMAPI_FLAGS \
 	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
 	{ XFS_BMAPI_METADATA,	"METADATA" }, \
@@ -120,7 +123,8 @@  struct xfs_extent_free_item
 	{ XFS_BMAPI_CONVERT,	"CONVERT" }, \
 	{ XFS_BMAPI_ZERO,	"ZERO" }, \
 	{ XFS_BMAPI_REMAP,	"REMAP" }, \
-	{ XFS_BMAPI_COWFORK,	"COWFORK" }
+	{ XFS_BMAPI_COWFORK,	"COWFORK" }, \
+	{ XFS_BMAPI_DELALLOC,	"DELALLOC" }
 
 
 static inline int xfs_bmapi_aflag(int w)
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index ca137b7..7ee8629 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -795,7 +795,7 @@  xfs_iomap_write_allocate(
 	xfs_trans_t	*tp;
 	int		nimaps;
 	int		error = 0;
-	int		flags = 0;
+	int		flags = XFS_BMAPI_DELALLOC;
 	int		nres;
 
 	if (whichfork == XFS_COW_FORK)