
[05/20] btrfs: simplify extent buffer reading

Message ID 20230309090526.332550-6-hch@lst.de (mailing list archive)
State New, archived
Series [01/20] btrfs: mark extent_buffer_under_io static

Commit Message

Christoph Hellwig March 9, 2023, 9:05 a.m. UTC
The btrfs_bio_ctrl machinery is overkill for reading extent_buffers,
as we always operate on PAGE_SIZE chunks (or a single smaller chunk in
the subpage case) that are contiguous and guaranteed to fit into a
single bio.  Replace it with open-coded btrfs_bio_alloc, __bio_add_page
and btrfs_submit_bio calls in a helper function shared between the
subpage and nodesize >= PAGE_SIZE cases.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/extent_io.c | 99 ++++++++++++++++----------------------------
 1 file changed, 36 insertions(+), 63 deletions(-)
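
In essence, both the subpage and the regular case now boil down to the
same handful of calls.  The snippet below is a sketch condensed from the
__read_extent_buffer_pages() helper added by this patch (see the full
diff below), with the error-bit and refcount bookkeeping elided:

	bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
			       REQ_OP_READ | REQ_META,
			       BTRFS_I(eb->fs_info->btree_inode),
			       end_bio_extent_readpage, NULL);
	bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
	bbio->file_offset = eb->start;
	memcpy(&bbio->parent_check, check, sizeof(*check));

	if (eb->fs_info->nodesize < PAGE_SIZE) {
		/* subpage: one partial page holds the whole buffer */
		__bio_add_page(&bbio->bio, eb->pages[0], eb->len,
			       eb->start - page_offset(eb->pages[0]));
	} else {
		/* full pages; an extent buffer always fits in a single bio */
		for (i = 0; i < num_pages; i++)
			__bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0);
	}

	btrfs_submit_bio(bbio, mirror_num);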

Comments

Johannes Thumshirn March 9, 2023, 11:59 a.m. UTC | #1
Looks good,
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Qu Wenruo March 10, 2023, 7:42 a.m. UTC | #2
On 2023/3/9 17:05, Christoph Hellwig wrote:
> The btrfs_bio_ctrl machinery is overkill for reading extent_buffers,
> as we always operate on PAGE_SIZE chunks (or a single smaller chunk
> in the subpage case) that are contiguous and guaranteed to fit into
> a single bio.

This is the legacy left by the older stripe boundary based bio split code.
(Please note that metadata crossing stripe boundaries is not ideal and
is very rare nowadays, but we should still support it.)

But now we have btrfs_submit_bio() handling such cases, thus it should 
be fine.

>  Replace it with open-coded btrfs_bio_alloc, __bio_add_page and
> btrfs_submit_bio calls in a helper function shared between the
> subpage and nodesize >= PAGE_SIZE cases.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Reviewed-by: Qu Wenruo <wqu@suse.com>

Thanks,
Qu
> ---
>   fs/btrfs/extent_io.c | 99 ++++++++++++++++----------------------------
>   1 file changed, 36 insertions(+), 63 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 26d8162bee000d..5169e73ffea647 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -98,22 +98,12 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
>    */
>   struct btrfs_bio_ctrl {
>   	struct btrfs_bio *bbio;
> -	int mirror_num;
>   	enum btrfs_compression_type compress_type;
>   	u32 len_to_oe_boundary;
>   	blk_opf_t opf;
>   	btrfs_bio_end_io_t end_io_func;
>   	struct writeback_control *wbc;
>   
> -	/*
> -	 * This is for metadata read, to provide the extra needed verification
> -	 * info.  This has to be provided for submit_one_bio(), as
> -	 * submit_one_bio() can submit a bio if it ends at stripe boundary.  If
> -	 * no such parent_check is provided, the metadata can hit false alert at
> -	 * endio time.
> -	 */
> -	struct btrfs_tree_parent_check *parent_check;
> -
>   	/*
>   	 * Tell writepage not to lock the state bits for this range, it still
>   	 * does the unlocking.
> @@ -124,7 +114,6 @@ struct btrfs_bio_ctrl {
>   static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
>   {
>   	struct btrfs_bio *bbio = bio_ctrl->bbio;
> -	int mirror_num = bio_ctrl->mirror_num;
>   
>   	if (!bbio)
>   		return;
> @@ -132,25 +121,14 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
>   	/* Caller should ensure the bio has at least some range added */
>   	ASSERT(bbio->bio.bi_iter.bi_size);
>   
> -	if (!is_data_inode(&bbio->inode->vfs_inode)) {
> -		if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) {
> -			/*
> -			 * For metadata read, we should have the parent_check,
> -			 * and copy it to bbio for metadata verification.
> -			 */
> -			ASSERT(bio_ctrl->parent_check);
> -			memcpy(&bbio->parent_check,
> -			       bio_ctrl->parent_check,
> -			       sizeof(struct btrfs_tree_parent_check));
> -		}
> +	if (!is_data_inode(&bbio->inode->vfs_inode))
>   		bbio->bio.bi_opf |= REQ_META;
> -	}
>   
>   	if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ &&
>   	    bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
> -		btrfs_submit_compressed_read(bbio, mirror_num);
> +		btrfs_submit_compressed_read(bbio, 0);
>   	else
> -		btrfs_submit_bio(bbio, mirror_num);
> +		btrfs_submit_bio(bbio, 0);
>   
>   	/* The bbio is owned by the end_io handler now */
>   	bio_ctrl->bbio = NULL;
> @@ -4241,6 +4219,36 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb)
>   	}
>   }
>   
> +static void __read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num,
> +				       struct btrfs_tree_parent_check *check)
> +{
> +	int num_pages = num_extent_pages(eb), i;
> +	struct btrfs_bio *bbio;
> +
> +	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
> +	eb->read_mirror = 0;
> +	atomic_set(&eb->io_pages, num_pages);
> +	check_buffer_tree_ref(eb);
> +
> +	bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
> +			       REQ_OP_READ | REQ_META,
> +			       BTRFS_I(eb->fs_info->btree_inode),
> +			       end_bio_extent_readpage, NULL);
> +	bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
> +	bbio->file_offset = eb->start;
> +	memcpy(&bbio->parent_check, check, sizeof(*check));
> +	if (eb->fs_info->nodesize < PAGE_SIZE) {
> +		__bio_add_page(&bbio->bio, eb->pages[0], eb->len,
> +			       eb->start - page_offset(eb->pages[0]));
> +	} else {
> +		for (i = 0; i < num_pages; i++) {
> +			ClearPageError(eb->pages[i]);
> +			__bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0);
> +		}
> +	}
> +	btrfs_submit_bio(bbio, mirror_num);
> +}
> +
>   static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
>   				      int mirror_num,
>   				      struct btrfs_tree_parent_check *check)
> @@ -4249,11 +4257,6 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
>   	struct extent_io_tree *io_tree;
>   	struct page *page = eb->pages[0];
>   	struct extent_state *cached_state = NULL;
> -	struct btrfs_bio_ctrl bio_ctrl = {
> -		.opf = REQ_OP_READ,
> -		.mirror_num = mirror_num,
> -		.parent_check = check,
> -	};
>   	int ret;
>   
>   	ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
> @@ -4281,18 +4284,10 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
>   		return 0;
>   	}
>   
> -	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
> -	eb->read_mirror = 0;
> -	atomic_set(&eb->io_pages, 1);
> -	check_buffer_tree_ref(eb);
> -	bio_ctrl.end_io_func = end_bio_extent_readpage;
> -
>   	btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
> -
>   	btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
> -	submit_extent_page(&bio_ctrl, eb->start, page, eb->len,
> -			   eb->start - page_offset(page));
> -	submit_one_bio(&bio_ctrl);
> +
> +	__read_extent_buffer_pages(eb, mirror_num, check);
>   	if (wait != WAIT_COMPLETE) {
>   		free_extent_state(cached_state);
>   		return 0;
> @@ -4313,11 +4308,6 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
>   	int locked_pages = 0;
>   	int all_uptodate = 1;
>   	int num_pages;
> -	struct btrfs_bio_ctrl bio_ctrl = {
> -		.opf = REQ_OP_READ,
> -		.mirror_num = mirror_num,
> -		.parent_check = check,
> -	};
>   
>   	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
>   		return 0;
> @@ -4367,24 +4357,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
>   		goto unlock_exit;
>   	}
>   
> -	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
> -	eb->read_mirror = 0;
> -	atomic_set(&eb->io_pages, num_pages);
> -	/*
> -	 * It is possible for release_folio to clear the TREE_REF bit before we
> -	 * set io_pages. See check_buffer_tree_ref for a more detailed comment.
> -	 */
> -	check_buffer_tree_ref(eb);
> -	bio_ctrl.end_io_func = end_bio_extent_readpage;
> -	for (i = 0; i < num_pages; i++) {
> -		page = eb->pages[i];
> -
> -		ClearPageError(page);
> -		submit_extent_page(&bio_ctrl, page_offset(page), page,
> -				   PAGE_SIZE, 0);
> -	}
> -
> -	submit_one_bio(&bio_ctrl);
> +	__read_extent_buffer_pages(eb, mirror_num, check);
>   
>   	if (wait != WAIT_COMPLETE)
>   		return 0;
Christoph Hellwig March 10, 2023, 7:47 a.m. UTC | #3
On Fri, Mar 10, 2023 at 03:42:04PM +0800, Qu Wenruo wrote:
> This is the legacy left by the older stripe boundary based bio split code.
> (Please note that metadata crossing stripe boundaries is not ideal and
> is very rare nowadays, but we should still support it.)

How can metadata cross a stripe boundary?
Qu Wenruo March 10, 2023, 8:02 a.m. UTC | #4
On 2023/3/10 15:47, Christoph Hellwig wrote:
> On Fri, Mar 10, 2023 at 03:42:04PM +0800, Qu Wenruo wrote:
>> This is the legacy left by the older stripe boundary based bio split code.
>> (Please note that metadata crossing stripe boundaries is not ideal and
>> is very rare nowadays, but we should still support it.)
> 
> How can metadata cross a stripe boundary?

Like this inside a RAID0 bg:

0	32K	64K	96K	128K
|             |//|//|           |

There is an old chunk allocator behavior where a chunk can start at a
physical bytenr which is only 4K aligned, but not STRIPE_LEN aligned.

In that case, the extent allocator can give us a range which crosses a
stripe boundary.

That's also why we have checks for metadata crossing stripe boundaries
in btrfs check and scrub.

Thanks,
Qu
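
To make the geometry concrete, here is a small stand-alone sketch of the
case described above.  It is not kernel code; it just assumes the usual
64K stripe length (BTRFS_STRIPE_LEN), a 16K nodesize, and an illustrative
chunk whose physical start sits 52K past a stripe boundary (4K aligned,
but not stripe aligned):

#include <stdio.h>

#define STRIPE_LEN	(64 * 1024ULL)	/* BTRFS_STRIPE_LEN */
#define NODESIZE	(16 * 1024ULL)	/* default metadata nodesize */

int main(void)
{
	/*
	 * Chunk created by the old allocator: only 4K aligned, so every
	 * extent inside it is shifted against the stripe grid.
	 */
	unsigned long long chunk_phys = 52 * 1024ULL;
	/* a metadata block allocated right at the start of that chunk */
	unsigned long long start = chunk_phys;
	unsigned long long end = start + NODESIZE - 1;

	if (start / STRIPE_LEN != end / STRIPE_LEN)
		printf("block [%llu, %llu] crosses the stripe boundary at %llu\n",
		       start, end, (start / STRIPE_LEN + 1) * STRIPE_LEN);
	return 0;
}

In a RAID0 block group the two halves of such a block live on different
devices, which is the kind of split that btrfs_submit_bio() now handles
internally.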
Christoph Hellwig March 10, 2023, 8:03 a.m. UTC | #5
On Fri, Mar 10, 2023 at 04:02:02PM +0800, Qu Wenruo wrote:
> Like this inside a RAID0 bg:
>
> 0	32K	64K	96K	128K
> |             |//|//|           |
>
> There is an old chunk allocator behavior where a chunk can start at a
> physical bytenr which is only 4K aligned, but not STRIPE_LEN aligned.
>
> In that case, the extent allocator can give us a range which crosses a
> stripe boundary.

Ewww, ok.  So the metadata isn't required to be naturally aligned.
Qu Wenruo March 10, 2023, 8:07 a.m. UTC | #6
On 2023/3/10 16:03, Christoph Hellwig wrote:
> On Fri, Mar 10, 2023 at 04:02:02PM +0800, Qu Wenruo wrote:
>> Like this inside a RAID0 bg:
>>
>> 0	32K	64K	96K	128K
>> |             |//|//|           |
>>
>> There is an old chunk allocator behavior where a chunk can start at a
>> physical bytenr which is only 4K aligned, but not STRIPE_LEN aligned.
>>
>> In that case, the extent allocator can give us a range which crosses a
>> stripe boundary.
> 
> Ewww, ok.  So the metadata isn't required to be naturally aligned.

Yes, but you don't need to spend too much time on that.
We haven't hit such a case for a long long time.

Unless the fs is super old and never balanced by any currently supported 
LTS kernel, it should be very rare to hit.

Thanks,
Qu
Christoph Hellwig March 10, 2023, 8:15 a.m. UTC | #7
On Fri, Mar 10, 2023 at 04:07:42PM +0800, Qu Wenruo wrote:
> Yes, but you don't need to spend too much time on that.
> We haven't hit such a case for a long long time.
>
> Unless the fs is super old and never balanced by any currently supported 
> LTS kernel, it should be very rare to hit.

Well, if it is a valid format we'll need to handle it.  And we probably
want a test case to exercise the code path to make sure it doesn't break
when it is so rarely exercised.
Qu Wenruo March 10, 2023, 9:14 a.m. UTC | #8
On 2023/3/10 16:15, Christoph Hellwig wrote:
> On Fri, Mar 10, 2023 at 04:07:42PM +0800, Qu Wenruo wrote:
>> Yes, but you don't need to spend too much time on that.
>> We haven't hit such a case for a long long time.
>>
>> Unless the fs is super old and never balanced by any currently supported
>> LTS kernel, it should be very rare to hit.
> 
> Well, if it is a valid format we'll need to handle it.  And we probably
> want a test case to exercise the code path to make sure it doesn't break
> when it is so rarely exercised.

Then we're already in big trouble:

- Fstests doesn't accept binary dumps

- We don't have any good way to create such slightly unaligned chunks.
   Older progs may not even compile, and any currently supported LTS
   kernel won't create such a chunk at all...

Thanks,
Qu
Filipe Manana March 10, 2023, 10:54 a.m. UTC | #9
On Fri, Mar 10, 2023 at 8:02 AM Christoph Hellwig <hch@lst.de> wrote:
>
> On Fri, Mar 10, 2023 at 03:42:04PM +0800, Qu Wenruo wrote:
> > This is the legacy left by the older stripe boundary based bio split code.
> > (Please note that metadata crossing stripe boundaries is not ideal and
> > is very rare nowadays, but we should still support it.)
>
> How can metadata cross a stripe boundary?

Probably with mixed block groups (mkfs.btrfs -O mixed-bg) you can get
that, as block groups are used to allocate both data extents (variable
length) and metadata extents (fixed length).
Qu Wenruo March 10, 2023, 11:12 a.m. UTC | #10
On 2023/3/10 18:54, Filipe Manana wrote:
> On Fri, Mar 10, 2023 at 8:02 AM Christoph Hellwig <hch@lst.de> wrote:
>>
>> On Fri, Mar 10, 2023 at 03:42:04PM +0800, Qu Wenruo wrote:
>>> This is the legacy left by the older stripe boundary based bio split code.
>>> (Please note that metadata crossing stripe boundaries is not ideal and
>>> is very rare nowadays, but we should still support it.)
>>
>> How can metadata cross a stripe boundary?
> 
> Probably with mixed block groups (mkfs.btrfs -O mixed-bg) you can get
> that, as block groups are used to allocate both data extents (variable
> length) and metadata extents (fixed length).

Mixed block groups require the nodesize to be the same as the sectorsize,
thus that's not possible.

Thanks,
Qu

Patch

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 26d8162bee000d..5169e73ffea647 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -98,22 +98,12 @@  void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
  */
 struct btrfs_bio_ctrl {
 	struct btrfs_bio *bbio;
-	int mirror_num;
 	enum btrfs_compression_type compress_type;
 	u32 len_to_oe_boundary;
 	blk_opf_t opf;
 	btrfs_bio_end_io_t end_io_func;
 	struct writeback_control *wbc;
 
-	/*
-	 * This is for metadata read, to provide the extra needed verification
-	 * info.  This has to be provided for submit_one_bio(), as
-	 * submit_one_bio() can submit a bio if it ends at stripe boundary.  If
-	 * no such parent_check is provided, the metadata can hit false alert at
-	 * endio time.
-	 */
-	struct btrfs_tree_parent_check *parent_check;
-
 	/*
 	 * Tell writepage not to lock the state bits for this range, it still
 	 * does the unlocking.
@@ -124,7 +114,6 @@  struct btrfs_bio_ctrl {
 static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
 {
 	struct btrfs_bio *bbio = bio_ctrl->bbio;
-	int mirror_num = bio_ctrl->mirror_num;
 
 	if (!bbio)
 		return;
@@ -132,25 +121,14 @@  static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
 	/* Caller should ensure the bio has at least some range added */
 	ASSERT(bbio->bio.bi_iter.bi_size);
 
-	if (!is_data_inode(&bbio->inode->vfs_inode)) {
-		if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) {
-			/*
-			 * For metadata read, we should have the parent_check,
-			 * and copy it to bbio for metadata verification.
-			 */
-			ASSERT(bio_ctrl->parent_check);
-			memcpy(&bbio->parent_check,
-			       bio_ctrl->parent_check,
-			       sizeof(struct btrfs_tree_parent_check));
-		}
+	if (!is_data_inode(&bbio->inode->vfs_inode))
 		bbio->bio.bi_opf |= REQ_META;
-	}
 
 	if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ &&
 	    bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
-		btrfs_submit_compressed_read(bbio, mirror_num);
+		btrfs_submit_compressed_read(bbio, 0);
 	else
-		btrfs_submit_bio(bbio, mirror_num);
+		btrfs_submit_bio(bbio, 0);
 
 	/* The bbio is owned by the end_io handler now */
 	bio_ctrl->bbio = NULL;
@@ -4241,6 +4219,36 @@  void set_extent_buffer_uptodate(struct extent_buffer *eb)
 	}
 }
 
+static void __read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num,
+				       struct btrfs_tree_parent_check *check)
+{
+	int num_pages = num_extent_pages(eb), i;
+	struct btrfs_bio *bbio;
+
+	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
+	eb->read_mirror = 0;
+	atomic_set(&eb->io_pages, num_pages);
+	check_buffer_tree_ref(eb);
+
+	bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
+			       REQ_OP_READ | REQ_META,
+			       BTRFS_I(eb->fs_info->btree_inode),
+			       end_bio_extent_readpage, NULL);
+	bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
+	bbio->file_offset = eb->start;
+	memcpy(&bbio->parent_check, check, sizeof(*check));
+	if (eb->fs_info->nodesize < PAGE_SIZE) {
+		__bio_add_page(&bbio->bio, eb->pages[0], eb->len,
+			       eb->start - page_offset(eb->pages[0]));
+	} else {
+		for (i = 0; i < num_pages; i++) {
+			ClearPageError(eb->pages[i]);
+			__bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0);
+		}
+	}
+	btrfs_submit_bio(bbio, mirror_num);
+}
+
 static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
 				      int mirror_num,
 				      struct btrfs_tree_parent_check *check)
@@ -4249,11 +4257,6 @@  static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
 	struct extent_io_tree *io_tree;
 	struct page *page = eb->pages[0];
 	struct extent_state *cached_state = NULL;
-	struct btrfs_bio_ctrl bio_ctrl = {
-		.opf = REQ_OP_READ,
-		.mirror_num = mirror_num,
-		.parent_check = check,
-	};
 	int ret;
 
 	ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
@@ -4281,18 +4284,10 @@  static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
 		return 0;
 	}
 
-	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
-	eb->read_mirror = 0;
-	atomic_set(&eb->io_pages, 1);
-	check_buffer_tree_ref(eb);
-	bio_ctrl.end_io_func = end_bio_extent_readpage;
-
 	btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
-
 	btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
-	submit_extent_page(&bio_ctrl, eb->start, page, eb->len,
-			   eb->start - page_offset(page));
-	submit_one_bio(&bio_ctrl);
+
+	__read_extent_buffer_pages(eb, mirror_num, check);
 	if (wait != WAIT_COMPLETE) {
 		free_extent_state(cached_state);
 		return 0;
@@ -4313,11 +4308,6 @@  int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
 	int locked_pages = 0;
 	int all_uptodate = 1;
 	int num_pages;
-	struct btrfs_bio_ctrl bio_ctrl = {
-		.opf = REQ_OP_READ,
-		.mirror_num = mirror_num,
-		.parent_check = check,
-	};
 
 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
 		return 0;
@@ -4367,24 +4357,7 @@  int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
 		goto unlock_exit;
 	}
 
-	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
-	eb->read_mirror = 0;
-	atomic_set(&eb->io_pages, num_pages);
-	/*
-	 * It is possible for release_folio to clear the TREE_REF bit before we
-	 * set io_pages. See check_buffer_tree_ref for a more detailed comment.
-	 */
-	check_buffer_tree_ref(eb);
-	bio_ctrl.end_io_func = end_bio_extent_readpage;
-	for (i = 0; i < num_pages; i++) {
-		page = eb->pages[i];
-
-		ClearPageError(page);
-		submit_extent_page(&bio_ctrl, page_offset(page), page,
-				   PAGE_SIZE, 0);
-	}
-
-	submit_one_bio(&bio_ctrl);
+	__read_extent_buffer_pages(eb, mirror_num, check);
 
 	if (wait != WAIT_COMPLETE)
 		return 0;