diff mbox series

[15/16] btrfs: refactor the zoned device handling in cow_file_range

Message ID 20230531060505.468704-16-hch@lst.de (mailing list archive)
State New, archived
Headers show
Series [01/16] btrfs: fix range_end calculation in extent_write_locked_range | expand

Commit Message

Christoph Hellwig May 31, 2023, 6:05 a.m. UTC
Handling of the done_offset to cow_file_range is a bit confusing, as
it is not updated at all when the function succeeds, and the -EAGAIN
status is used bother for the case where we need to wait for a zone
finish and the one where the allocation was partially successful.

Change the calling convention so that done_offset is always updated,
and 0 is returned if some allocation was successful (partial allocation
can still only happen for zoned devices), and -EAGAIN is only returned
when the caller needs to wait for a zone finish.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/inode.c | 53 ++++++++++++++++++++++++------------------------
 1 file changed, 27 insertions(+), 26 deletions(-)

Comments

Naohiro Aota June 6, 2023, 2:20 p.m. UTC | #1
On Wed, May 31, 2023 at 08:05:04AM +0200, Christoph Hellwig wrote:
> Handling of the done_offset to cow_file_range is a bit confusing, as
> it is not updated at all when the function succeeds, and the -EAGAIN
> status is used bother for the case where we need to wait for a zone
> finish and the one where the allocation was partially successful.
> 
> Change the calling convention so that done_offset is always updated,
> and 0 is returned if some allocation was successful (partial allocation
> can still only happen for zoned devices), and -EAGAIN is only returned
> when the caller needs to wait for a zone finish.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/btrfs/inode.c | 53 ++++++++++++++++++++++++------------------------
>  1 file changed, 27 insertions(+), 26 deletions(-)
> 
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 68ae20a3f785e3..a12d4f77859706 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -1403,7 +1403,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
>  	unsigned clear_bits;
>  	unsigned long page_ops;
>  	bool extent_reserved = false;
> -	int ret = 0;
> +	int ret;
>  
>  	if (btrfs_is_free_space_inode(inode)) {
>  		ret = -EINVAL;
> @@ -1462,7 +1462,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
>  			 * inline extent or a compressed extent.
>  			 */
>  			unlock_page(locked_page);
> -			goto out;
> +			goto done;
>  		} else if (ret < 0) {
>  			goto out_unlock;
>  		}
> @@ -1491,6 +1491,23 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
>  		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
>  					   min_alloc_size, 0, alloc_hint,
>  					   &ins, 1, 1);
> +		if (ret == -EAGAIN) {

Yeah, moving this check here is reasonable. Currently, we check the return
value of cow_file_range_inline(), btrfs_reserve_extent(), and
btrfs_reloc_clone_csums(), but -EAGAIN is only meaningful for
btrfs_reserve_extent().

> +			/*
> +			 * For zoned devices, let the caller retry after writing
> +			 * out the already allocated regions or waiting for a
> +			 * zone to finish if no allocation was possible at all.
> +			 *
> +			 * Else convert to -ENOSPC since the caller cannot
> +			 * retry.
> +			 */
> +			if (btrfs_is_zoned(fs_info)) {
> +				if (start == orig_start)
> +					return -EAGAIN;
> +				*done_offset = start - 1;

This will hit a null pointer dereference when it is called from
submit_uncompressed_range(). Well, that means we need to implement the
partial writing also in submit_uncompressed_range().

BTW, we also need to handle -EAGAIN from btrfs_reserve_extent() in
btrfs_new_extent_direct() as it currently returns -EAGAIN to the userland.

> +				return 0;
> +			}

Once the above issue is solved, we can just ASSERT(btrfs_is_zoned()) for
ret == -EAGAIN case.

> +			ret = -ENOSPC;
> +		}
>  		if (ret < 0)
>  			goto out_unlock;
>  		cur_alloc_size = ins.offset;
> @@ -1571,8 +1588,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
>  		if (ret)
>  			goto out_unlock;
>  	}
> -out:
> -	return ret;
> +done:
> +	if (done_offset)
> +		*done_offset = end;
> +	return 0;
>  
>  out_drop_extent_cache:
>  	btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
> @@ -1580,21 +1599,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
>  	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
>  	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
>  out_unlock:
> -	/*
> -	 * If done_offset is non-NULL and ret == -EAGAIN, we expect the
> -	 * caller to write out the successfully allocated region and retry.
> -	 */
> -	if (done_offset && ret == -EAGAIN) {
> -		if (orig_start < start)
> -			*done_offset = start - 1;
> -		else
> -			*done_offset = start;
> -		return ret;
> -	} else if (ret == -EAGAIN) {
> -		/* Convert to -ENOSPC since the caller cannot retry. */
> -		ret = -ENOSPC;
> -	}
> -
>  	/*
>  	 * Now, we have three regions to clean up:
>  	 *
> @@ -1826,23 +1830,20 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
>  	while (start <= end) {
>  		ret = cow_file_range(inode, locked_page, start, end, page_started,
>  				     nr_written, 0, &done_offset);
> -		if (ret && ret != -EAGAIN)
> -			return ret;
> -
>  		if (*page_started) {
>  			ASSERT(ret == 0);
>  			return 0;
>  		}
> +		if (ret == -EAGAIN) {
> +			ASSERT(btrfs_is_zoned(inode->root->fs_info));

Is this necessary? run_delalloc_zoned() should be called only for zoned
mode anyway.

>  
> -		if (ret == 0)
> -			done_offset = end;
> -
> -		if (done_offset == start) {
>  			wait_on_bit_io(&inode->root->fs_info->flags,
>  				       BTRFS_FS_NEED_ZONE_FINISH,
>  				       TASK_UNINTERRUPTIBLE);
>  			continue;
>  		}
> +		if (ret)
> +			return ret;
>  
>  		extent_write_locked_range(&inode->vfs_inode, locked_page, start,
>  					  done_offset, wbc);
> -- 
> 2.39.2
>
Christoph Hellwig June 7, 2023, 7:27 a.m. UTC | #2
On Tue, Jun 06, 2023 at 02:20:50PM +0000, Naohiro Aota wrote:
> > +			/*
> > +			 * For zoned devices, let the caller retry after writing
> > +			 * out the already allocated regions or waiting for a
> > +			 * zone to finish if no allocation was possible at all.
> > +			 *
> > +			 * Else convert to -ENOSPC since the caller cannot
> > +			 * retry.
> > +			 */
> > +			if (btrfs_is_zoned(fs_info)) {
> > +				if (start == orig_start)
> > +					return -EAGAIN;
> > +				*done_offset = start - 1;
> 
> This will hit a null pointer dereference when it is called from
> submit_uncompressed_range(). Well, that means we need to implement the
> partial writing also in submit_uncompressed_range().

I think we need to do that anyway, don't we?  As far as I can tell
submit_uncompressed_range is simply broken right now on zoned devices
if cow_file_range fails to allocate all blocks.

> BTW, we also need to handle -EAGAIN from btrfs_reserve_extent() in
> btrfs_new_extent_direct() as it currently returns -EAGAIN to the userland.

Indeed.

> >  			ASSERT(ret == 0);
> >  			return 0;
> >  		}
> > +		if (ret == -EAGAIN) {
> > +			ASSERT(btrfs_is_zoned(inode->root->fs_info));
> 
> Is this necessary? run_delalloc_zoned() should be called only for zoned
> mode anyway.

True.
diff mbox series

Patch

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 68ae20a3f785e3..a12d4f77859706 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1403,7 +1403,7 @@  static noinline int cow_file_range(struct btrfs_inode *inode,
 	unsigned clear_bits;
 	unsigned long page_ops;
 	bool extent_reserved = false;
-	int ret = 0;
+	int ret;
 
 	if (btrfs_is_free_space_inode(inode)) {
 		ret = -EINVAL;
@@ -1462,7 +1462,7 @@  static noinline int cow_file_range(struct btrfs_inode *inode,
 			 * inline extent or a compressed extent.
 			 */
 			unlock_page(locked_page);
-			goto out;
+			goto done;
 		} else if (ret < 0) {
 			goto out_unlock;
 		}
@@ -1491,6 +1491,23 @@  static noinline int cow_file_range(struct btrfs_inode *inode,
 		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
 					   min_alloc_size, 0, alloc_hint,
 					   &ins, 1, 1);
+		if (ret == -EAGAIN) {
+			/*
+			 * For zoned devices, let the caller retry after writing
+			 * out the already allocated regions or waiting for a
+			 * zone to finish if no allocation was possible at all.
+			 *
+			 * Else convert to -ENOSPC since the caller cannot
+			 * retry.
+			 */
+			if (btrfs_is_zoned(fs_info)) {
+				if (start == orig_start)
+					return -EAGAIN;
+				*done_offset = start - 1;
+				return 0;
+			}
+			ret = -ENOSPC;
+		}
 		if (ret < 0)
 			goto out_unlock;
 		cur_alloc_size = ins.offset;
@@ -1571,8 +1588,10 @@  static noinline int cow_file_range(struct btrfs_inode *inode,
 		if (ret)
 			goto out_unlock;
 	}
-out:
-	return ret;
+done:
+	if (done_offset)
+		*done_offset = end;
+	return 0;
 
 out_drop_extent_cache:
 	btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
@@ -1580,21 +1599,6 @@  static noinline int cow_file_range(struct btrfs_inode *inode,
 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
 out_unlock:
-	/*
-	 * If done_offset is non-NULL and ret == -EAGAIN, we expect the
-	 * caller to write out the successfully allocated region and retry.
-	 */
-	if (done_offset && ret == -EAGAIN) {
-		if (orig_start < start)
-			*done_offset = start - 1;
-		else
-			*done_offset = start;
-		return ret;
-	} else if (ret == -EAGAIN) {
-		/* Convert to -ENOSPC since the caller cannot retry. */
-		ret = -ENOSPC;
-	}
-
 	/*
 	 * Now, we have three regions to clean up:
 	 *
@@ -1826,23 +1830,20 @@  static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
 	while (start <= end) {
 		ret = cow_file_range(inode, locked_page, start, end, page_started,
 				     nr_written, 0, &done_offset);
-		if (ret && ret != -EAGAIN)
-			return ret;
-
 		if (*page_started) {
 			ASSERT(ret == 0);
 			return 0;
 		}
+		if (ret == -EAGAIN) {
+			ASSERT(btrfs_is_zoned(inode->root->fs_info));
 
-		if (ret == 0)
-			done_offset = end;
-
-		if (done_offset == start) {
 			wait_on_bit_io(&inode->root->fs_info->flags,
 				       BTRFS_FS_NEED_ZONE_FINISH,
 				       TASK_UNINTERRUPTIBLE);
 			continue;
 		}
+		if (ret)
+			return ret;
 
 		extent_write_locked_range(&inode->vfs_inode, locked_page, start,
 					  done_offset, wbc);