diff mbox series

[3/8] xfs: don't use delalloc extents for COW on files with extsize hints

Message ID 20190218091827.12619-4-hch@lst.de (mailing list archive)
State Accepted, archived
Headers show
Series [1/8] xfs: make xfs_bmbt_to_iomap more useful | expand

Commit Message

Christoph Hellwig Feb. 18, 2019, 9:18 a.m. UTC
While using delalloc for extsize hints is generally a good idea, the
current code that does so only for COW doesn't help us much and creates
a lot of special cases.  Switch it to use real allocations like we
do for direct I/O.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_iomap.c   | 28 +++++++++++++++++-----------
 fs/xfs/xfs_reflink.c | 10 +++++++++-
 fs/xfs/xfs_reflink.h |  3 ++-
 3 files changed, 28 insertions(+), 13 deletions(-)

Comments

Darrick J. Wong Feb. 19, 2019, 5:17 a.m. UTC | #1
On Mon, Feb 18, 2019 at 10:18:22AM +0100, Christoph Hellwig wrote:
> While using delalloc for extsize hints is generally a good idea, the
> current code that does so only for COW doesn't help us much and creates
> a lot of special cases.  Switch it to use real allocations like we
> do for direct I/O.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_iomap.c   | 28 +++++++++++++++++-----------
>  fs/xfs/xfs_reflink.c | 10 +++++++++-
>  fs/xfs/xfs_reflink.h |  3 ++-
>  3 files changed, 28 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index a7599b084571..19a3331b4a56 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -918,22 +918,28 @@ xfs_file_iomap_begin(
>  	 * been done up front, so we don't need to do them here.
>  	 */
>  	if (xfs_is_reflink_inode(ip)) {
> +		struct xfs_bmbt_irec	orig = imap;
> +
>  		/* if zeroing doesn't need COW allocation, then we are done. */
>  		if ((flags & IOMAP_ZERO) &&
>  		    !needs_cow_for_zeroing(&imap, nimaps))
>  			goto out_found;
>  
> -		if (flags & IOMAP_DIRECT) {
> -			/* may drop and re-acquire the ilock */

AFAICT we can still cycle the ilock in _reflink_allocate_cow, so we
should retain the comment.  I'll fix that in my tree if I end up pulling
in this series...

Otherwise looks ok,
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

--D


> -			error = xfs_reflink_allocate_cow(ip, &imap, &shared,
> -					&lockmode);
> -			if (error)
> -				goto out_unlock;
> -		} else {
> -			error = xfs_reflink_reserve_cow(ip, &imap);
> -			if (error)
> -				goto out_unlock;
> -		}
> +		error = xfs_reflink_allocate_cow(ip, &imap, &shared, &lockmode,
> +						 flags);
> +		if (error)
> +			goto out_unlock;
> +
> +		/*
> +		 * For buffered writes we need to report the address of the
> +		 * previous block (if there was any) so that the higher level
> +		 * write code can perform read-modify-write operations.  For
> +		 * direct I/O code, which must be block aligned we need to
> +		 * report the newly allocated address.
> +		 */
> +		if (!(flags & IOMAP_DIRECT) &&
> +		    orig.br_startblock != HOLESTARTBLOCK)
> +			imap = orig;
>  
>  		end_fsb = imap.br_startoff + imap.br_blockcount;
>  		length = XFS_FSB_TO_B(mp, end_fsb) - offset;
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 2babc2cbe103..8a5353daf9ab 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -397,7 +397,8 @@ xfs_reflink_allocate_cow(
>  	struct xfs_inode	*ip,
>  	struct xfs_bmbt_irec	*imap,
>  	bool			*shared,
> -	uint			*lockmode)
> +	uint			*lockmode,
> +	unsigned		iomap_flags)
>  {
>  	struct xfs_mount	*mp = ip->i_mount;
>  	xfs_fileoff_t		offset_fsb = imap->br_startoff;
> @@ -471,6 +472,13 @@ xfs_reflink_allocate_cow(
>  	if (nimaps == 0)
>  		return -ENOSPC;
>  convert:
> +	/*
> +	 * COW fork extents are supposed to remain unwritten until we're ready
> +	 * to initiate a disk write.  For direct I/O we are going to write the
> +	 * data and need the conversion, but for buffered writes we're done.
> +	 */
> +	if (!(iomap_flags & IOMAP_DIRECT))
> +		return 0;
>  	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
>  
>  out_unreserve:
> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> index 6d73daef1f13..70d68a1a9b49 100644
> --- a/fs/xfs/xfs_reflink.h
> +++ b/fs/xfs/xfs_reflink.h
> @@ -15,7 +15,8 @@ extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
>  extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
>  		struct xfs_bmbt_irec *imap);
>  extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
> -		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
> +		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
> +		unsigned iomap_flags);
>  extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
>  		xfs_off_t count);
>  
> -- 
> 2.20.1
>
Brian Foster Feb. 21, 2019, 5:58 p.m. UTC | #2
On Mon, Feb 18, 2019 at 10:18:22AM +0100, Christoph Hellwig wrote:
> While using delalloc for extsize hints is generally a good idea, the
> current code that does so only for COW doesn't help us much and creates
> a lot of special cases.  Switch it to use real allocations like we
> do for direct I/O.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_iomap.c   | 28 +++++++++++++++++-----------
>  fs/xfs/xfs_reflink.c | 10 +++++++++-
>  fs/xfs/xfs_reflink.h |  3 ++-
>  3 files changed, 28 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index a7599b084571..19a3331b4a56 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -918,22 +918,28 @@ xfs_file_iomap_begin(
>  	 * been done up front, so we don't need to do them here.
>  	 */
>  	if (xfs_is_reflink_inode(ip)) {
> +		struct xfs_bmbt_irec	orig = imap;
> +
...
> +		error = xfs_reflink_allocate_cow(ip, &imap, &shared, &lockmode,
> +						 flags);
> +		if (error)
> +			goto out_unlock;
> +
> +		/*
> +		 * For buffered writes we need to report the address of the
> +		 * previous block (if there was any) so that the higher level
> +		 * write code can perform read-modify-write operations.  For
> +		 * direct I/O code, which must be block aligned we need to
> +		 * report the newly allocated address.
> +		 */
> +		if (!(flags & IOMAP_DIRECT) &&
> +		    orig.br_startblock != HOLESTARTBLOCK)
> +			imap = orig;

I find the logic here kind of confusing. The buffered write (reflink)
path basically expects to allocated COW blocks over an existing shared
extent. It thus makes no modification to the caller's imap because it
(read-modify-)writes into cache and writeback determines where to send
the I/O. Why not follow the same flow here? For example:

	cmap = orig;
	error = xfs_reflink_allocate_cow(ip, &cmap, ...);
	/* ... */
	if ((flags & IOMAP_DIRECT) || (imap.br_startblock == HOLESTARTBLOCK))
		imap = cmap;

And also, what's the point of the hole check.. to avoid an unnecessary
data fork allocation below? That should be explained in the comment.

>  
>  		end_fsb = imap.br_startoff + imap.br_blockcount;
>  		length = XFS_FSB_TO_B(mp, end_fsb) - offset;
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 2babc2cbe103..8a5353daf9ab 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -397,7 +397,8 @@ xfs_reflink_allocate_cow(
>  	struct xfs_inode	*ip,
>  	struct xfs_bmbt_irec	*imap,
>  	bool			*shared,
> -	uint			*lockmode)
> +	uint			*lockmode,
> +	unsigned		iomap_flags)

I don't see why a lower level reflink mechanism needs to care about
direct I/O or not. IMO this should just be a 'bool convert' param.

Brian

>  {
>  	struct xfs_mount	*mp = ip->i_mount;
>  	xfs_fileoff_t		offset_fsb = imap->br_startoff;
> @@ -471,6 +472,13 @@ xfs_reflink_allocate_cow(
>  	if (nimaps == 0)
>  		return -ENOSPC;
>  convert:
> +	/*
> +	 * COW fork extents are supposed to remain unwritten until we're ready
> +	 * to initiate a disk write.  For direct I/O we are going to write the
> +	 * data and need the conversion, but for buffered writes we're done.
> +	 */
> +	if (!(iomap_flags & IOMAP_DIRECT))
> +		return 0;
>  	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
>  
>  out_unreserve:
> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> index 6d73daef1f13..70d68a1a9b49 100644
> --- a/fs/xfs/xfs_reflink.h
> +++ b/fs/xfs/xfs_reflink.h
> @@ -15,7 +15,8 @@ extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
>  extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
>  		struct xfs_bmbt_irec *imap);
>  extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
> -		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
> +		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
> +		unsigned iomap_flags);
>  extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
>  		xfs_off_t count);
>  
> -- 
> 2.20.1
>
Darrick J. Wong Feb. 21, 2019, 10:56 p.m. UTC | #3
On Thu, Feb 21, 2019 at 12:58:17PM -0500, Brian Foster wrote:
> On Mon, Feb 18, 2019 at 10:18:22AM +0100, Christoph Hellwig wrote:
> > While using delalloc for extsize hints is generally a good idea, the
> > current code that does so only for COW doesn't help us much and creates
> > a lot of special cases.  Switch it to use real allocations like we
> > do for direct I/O.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> >  fs/xfs/xfs_iomap.c   | 28 +++++++++++++++++-----------
> >  fs/xfs/xfs_reflink.c | 10 +++++++++-
> >  fs/xfs/xfs_reflink.h |  3 ++-
> >  3 files changed, 28 insertions(+), 13 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> > index a7599b084571..19a3331b4a56 100644
> > --- a/fs/xfs/xfs_iomap.c
> > +++ b/fs/xfs/xfs_iomap.c
> > @@ -918,22 +918,28 @@ xfs_file_iomap_begin(
> >  	 * been done up front, so we don't need to do them here.
> >  	 */
> >  	if (xfs_is_reflink_inode(ip)) {
> > +		struct xfs_bmbt_irec	orig = imap;
> > +
> ...
> > +		error = xfs_reflink_allocate_cow(ip, &imap, &shared, &lockmode,
> > +						 flags);
> > +		if (error)
> > +			goto out_unlock;
> > +
> > +		/*
> > +		 * For buffered writes we need to report the address of the
> > +		 * previous block (if there was any) so that the higher level
> > +		 * write code can perform read-modify-write operations.  For
> > +		 * direct I/O code, which must be block aligned we need to
> > +		 * report the newly allocated address.
> > +		 */
> > +		if (!(flags & IOMAP_DIRECT) &&
> > +		    orig.br_startblock != HOLESTARTBLOCK)
> > +			imap = orig;
> 
> I find the logic here kind of confusing. The buffered write (reflink)
> path basically expects to allocated COW blocks over an existing shared
> extent. It thus makes no modification to the caller's imap because it
> (read-modify-)writes into cache and writeback determines where to send
> the I/O. Why not follow the same flow here? For example:
> 
> 	cmap = orig;
> 	error = xfs_reflink_allocate_cow(ip, &cmap, ...);
> 	/* ... */
> 	if ((flags & IOMAP_DIRECT) || (imap.br_startblock == HOLESTARTBLOCK))
> 		imap = cmap;

Admittedly, I think that flows better.

> And also, what's the point of the hole check.. to avoid an unnecessary
> data fork allocation below? That should be explained in the comment.
>
> >  
> >  		end_fsb = imap.br_startoff + imap.br_blockcount;
> >  		length = XFS_FSB_TO_B(mp, end_fsb) - offset;
> > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> > index 2babc2cbe103..8a5353daf9ab 100644
> > --- a/fs/xfs/xfs_reflink.c
> > +++ b/fs/xfs/xfs_reflink.c
> > @@ -397,7 +397,8 @@ xfs_reflink_allocate_cow(
> >  	struct xfs_inode	*ip,
> >  	struct xfs_bmbt_irec	*imap,
> >  	bool			*shared,
> > -	uint			*lockmode)
> > +	uint			*lockmode,
> > +	unsigned		iomap_flags)
> 
> I don't see why a lower level reflink mechanism needs to care about
> direct I/O or not. IMO this should just be a 'bool convert' param.

I forgot I'd complained about this the last time I read this patch.

Well maybe I'll just fix it, seeing as I already pushed the whole lot to
for-next. :/

--D

> Brian
> 
> >  {
> >  	struct xfs_mount	*mp = ip->i_mount;
> >  	xfs_fileoff_t		offset_fsb = imap->br_startoff;
> > @@ -471,6 +472,13 @@ xfs_reflink_allocate_cow(
> >  	if (nimaps == 0)
> >  		return -ENOSPC;
> >  convert:
> > +	/*
> > +	 * COW fork extents are supposed to remain unwritten until we're ready
> > +	 * to initiate a disk write.  For direct I/O we are going to write the
> > +	 * data and need the conversion, but for buffered writes we're done.
> > +	 */
> > +	if (!(iomap_flags & IOMAP_DIRECT))
> > +		return 0;
> >  	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
> >  
> >  out_unreserve:
> > diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> > index 6d73daef1f13..70d68a1a9b49 100644
> > --- a/fs/xfs/xfs_reflink.h
> > +++ b/fs/xfs/xfs_reflink.h
> > @@ -15,7 +15,8 @@ extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
> >  extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
> >  		struct xfs_bmbt_irec *imap);
> >  extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
> > -		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
> > +		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
> > +		unsigned iomap_flags);
> >  extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
> >  		xfs_off_t count);
> >  
> > -- 
> > 2.20.1
> >
Christoph Hellwig Feb. 22, 2019, 2:16 p.m. UTC | #4
On Thu, Feb 21, 2019 at 12:58:17PM -0500, Brian Foster wrote:
> > +		/*
> > +		 * For buffered writes we need to report the address of the
> > +		 * previous block (if there was any) so that the higher level
> > +		 * write code can perform read-modify-write operations.  For
> > +		 * direct I/O code, which must be block aligned we need to
> > +		 * report the newly allocated address.
> > +		 */
> > +		if (!(flags & IOMAP_DIRECT) &&
> > +		    orig.br_startblock != HOLESTARTBLOCK)
> > +			imap = orig;
> 
> I find the logic here kind of confusing. The buffered write (reflink)
> path basically expects to allocated COW blocks over an existing shared
> extent. It thus makes no modification to the caller's imap because it
> (read-modify-)writes into cache and writeback determines where to send
> the I/O. Why not follow the same flow here? For example:

This is to optimize for the command case.  Both in direct I/O being
actually common over extent size hints, and also over this being
the sensible behavior while the buffered I/O behavior of returning
the old map is somewhat odd.

I have outstanding todo items to switch extent size hint based buffered
I/O to use delalloc reservations, and to clean up how the iomap code
currently hacks around the lack of a clear interface for the
read-modify-write cycles in buffered I/O, both of which would remove
this hack above without touching the surrounding code.

> > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> > index 2babc2cbe103..8a5353daf9ab 100644
> > --- a/fs/xfs/xfs_reflink.c
> > +++ b/fs/xfs/xfs_reflink.c
> > @@ -397,7 +397,8 @@ xfs_reflink_allocate_cow(
> >  	struct xfs_inode	*ip,
> >  	struct xfs_bmbt_irec	*imap,
> >  	bool			*shared,
> > -	uint			*lockmode)
> > +	uint			*lockmode,
> > +	unsigned		iomap_flags)
> 
> I don't see why a lower level reflink mechanism needs to care about
> direct I/O or not. IMO this should just be a 'bool convert' param.

My memory is a little vague, but I think Darrick preferred it this way.
diff mbox series

Patch

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index a7599b084571..19a3331b4a56 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -918,22 +918,28 @@  xfs_file_iomap_begin(
 	 * been done up front, so we don't need to do them here.
 	 */
 	if (xfs_is_reflink_inode(ip)) {
+		struct xfs_bmbt_irec	orig = imap;
+
 		/* if zeroing doesn't need COW allocation, then we are done. */
 		if ((flags & IOMAP_ZERO) &&
 		    !needs_cow_for_zeroing(&imap, nimaps))
 			goto out_found;
 
-		if (flags & IOMAP_DIRECT) {
-			/* may drop and re-acquire the ilock */
-			error = xfs_reflink_allocate_cow(ip, &imap, &shared,
-					&lockmode);
-			if (error)
-				goto out_unlock;
-		} else {
-			error = xfs_reflink_reserve_cow(ip, &imap);
-			if (error)
-				goto out_unlock;
-		}
+		error = xfs_reflink_allocate_cow(ip, &imap, &shared, &lockmode,
+						 flags);
+		if (error)
+			goto out_unlock;
+
+		/*
+		 * For buffered writes we need to report the address of the
+		 * previous block (if there was any) so that the higher level
+		 * write code can perform read-modify-write operations.  For
+		 * direct I/O code, which must be block aligned we need to
+		 * report the newly allocated address.
+		 */
+		if (!(flags & IOMAP_DIRECT) &&
+		    orig.br_startblock != HOLESTARTBLOCK)
+			imap = orig;
 
 		end_fsb = imap.br_startoff + imap.br_blockcount;
 		length = XFS_FSB_TO_B(mp, end_fsb) - offset;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 2babc2cbe103..8a5353daf9ab 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -397,7 +397,8 @@  xfs_reflink_allocate_cow(
 	struct xfs_inode	*ip,
 	struct xfs_bmbt_irec	*imap,
 	bool			*shared,
-	uint			*lockmode)
+	uint			*lockmode,
+	unsigned		iomap_flags)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	xfs_fileoff_t		offset_fsb = imap->br_startoff;
@@ -471,6 +472,13 @@  xfs_reflink_allocate_cow(
 	if (nimaps == 0)
 		return -ENOSPC;
 convert:
+	/*
+	 * COW fork extents are supposed to remain unwritten until we're ready
+	 * to initiate a disk write.  For direct I/O we are going to write the
+	 * data and need the conversion, but for buffered writes we're done.
+	 */
+	if (!(iomap_flags & IOMAP_DIRECT))
+		return 0;
 	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
 
 out_unreserve:
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 6d73daef1f13..70d68a1a9b49 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -15,7 +15,8 @@  extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
 extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
 		struct xfs_bmbt_irec *imap);
 extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
-		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
+		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
+		unsigned iomap_flags);
 extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
 		xfs_off_t count);