
[5/6] btrfs: prepare extent_io.c for future larger folio support

Message ID 657d28be4aebee9d3b40e7e34b0c1b75fbbf5da6.1741591823.git.wqu@suse.com (mailing list archive)
State New
Series btrfs: prepare for larger folios support

Commit Message

Qu Wenruo March 10, 2025, 7:36 a.m. UTC
When we're handling folios from filemap, we can no longer assume all
folios are page sized.

Thus for call sites assuming the folio is page sized, change the
PAGE_SIZE usage to folio_size() instead.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/extent_io.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)
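
To illustrate the pattern the patch applies, here is a minimal sketch
(the helper name range_inside_folio() is hypothetical, not part of the
patch): with larger folios, a byte-range check against a folio must use
folio_size(), since PAGE_SIZE only covers the first page of a
multi-page folio.

static inline bool range_inside_folio(struct folio *folio, u64 start, u32 len)
{
	/* folio_size() covers the whole folio, however many pages it spans. */
	return folio_pos(folio) <= start &&
	       start + len <= folio_pos(folio) + folio_size(folio);
}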

Comments

Boris Burkov March 10, 2025, 11:27 p.m. UTC | #1
On Mon, Mar 10, 2025 at 06:06:01PM +1030, Qu Wenruo wrote:
> When we're handling folios from filemap, we can no longer assume all
> folios are page sized.
> 
> Thus for call sites assuming the folio is page sized, change the
> PAGE_SIZE usage to folio_size() instead.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  fs/btrfs/extent_io.c | 33 +++++++++++++++++++--------------
>  1 file changed, 19 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 337d2bed98d9..337908f09b88 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -425,7 +425,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le
>  	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
>  
>  	ASSERT(folio_pos(folio) <= start &&
> -	       start + len <= folio_pos(folio) + PAGE_SIZE);
> +	       start + len <= folio_pos(folio) + folio_size(folio));
>  
>  	if (uptodate && btrfs_verify_folio(folio, start, len))
>  		btrfs_folio_set_uptodate(fs_info, folio, start, len);
> @@ -492,7 +492,7 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio)
>  		return;
>  
>  	ASSERT(folio_test_private(folio));
> -	btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE);
> +	btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio));
>  }
>  
>  /*
> @@ -753,7 +753,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
>  {
>  	struct btrfs_inode *inode = folio_to_inode(folio);
>  
> -	ASSERT(pg_offset + size <= PAGE_SIZE);
> +	ASSERT(pg_offset + size <= folio_size(folio));
>  	ASSERT(bio_ctrl->end_io_func);
>  
>  	if (bio_ctrl->bbio &&
> @@ -935,7 +935,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
>  	struct inode *inode = folio->mapping->host;
>  	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
>  	u64 start = folio_pos(folio);
> -	const u64 end = start + PAGE_SIZE - 1;
> +	const u64 end = start + folio_size(folio) - 1;
>  	u64 extent_offset;
>  	u64 last_byte = i_size_read(inode);
>  	struct extent_map *em;
> @@ -1279,7 +1279,7 @@ static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bit
>  	unsigned int start_bit;
>  	unsigned int nbits;
>  
> -	ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE);
> +	ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio));
>  	start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
>  	nbits = len >> fs_info->sectorsize_bits;
>  	ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits));
> @@ -1297,7 +1297,7 @@ static bool find_next_delalloc_bitmap(struct folio *folio,
>  	unsigned int first_zero;
>  	unsigned int first_set;
>  
> -	ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE);
> +	ASSERT(start >= folio_start && start < folio_start + folio_size(folio));
>  
>  	start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
>  	first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit);
> @@ -1499,10 +1499,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
>  		delalloc_end = page_end;
>  	/*
>  	 * delalloc_end is already one less than the total length, so
> -	 * we don't subtract one from PAGE_SIZE
> +	 * we don't subtract one from folio_size().
>  	 */
>  	delalloc_to_write +=
> -		DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE);
> +		DIV_ROUND_UP(delalloc_end + 1 - page_start, folio_size(folio));
>  
>  	/*
>  	 * If all ranges are submitted asynchronously, we just need to account
> @@ -1765,7 +1765,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
>  		goto done;
>  
>  	ret = extent_writepage_io(inode, folio, folio_pos(folio),
> -				  PAGE_SIZE, bio_ctrl, i_size);
> +				  folio_size(folio), bio_ctrl, i_size);
>  	if (ret == 1)
>  		return 0;
>  	if (ret < 0)
> @@ -2492,8 +2492,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
>  	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
>  
>  	while (cur <= end) {
> -		u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
> -		u32 cur_len = cur_end + 1 - cur;
> +		u64 cur_end;
> +		u32 cur_len;
>  		struct folio *folio;
>  
>  		folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT);
> @@ -2503,13 +2503,18 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
>  		 * code is just in case, but shouldn't actually be run.
>  		 */
>  		if (IS_ERR(folio)) {
> +			cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
> +			cur_len = cur_end + 1 - cur;

Curious: is this guess based on PAGE_SIZE more correct than using 1 as
num_bytes? How much more correct? :)

Probably better question: what is the behavior for the range
[PAGE_SIZE, folio_size(folio)] should that filemap_get_folio have
returned properly? If it truly can never happen AND we can't handle
it correctly, is handling it "sort of correctly" a good idea?

>  			btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
>  						       cur, cur_len, false);
>  			mapping_set_error(mapping, PTR_ERR(folio));
> -			cur = cur_end + 1;
> +			cur = cur_end;
>  			continue;
>  		}
>  
> +		cur_end = min(folio_pos(folio) + folio_size(folio) - 1, end);
> +		cur_len = cur_end + 1 - cur;
> +
>  		ASSERT(folio_test_locked(folio));
>  		if (pages_dirty && folio != locked_folio)
>  			ASSERT(folio_test_dirty(folio));
> @@ -2621,7 +2626,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree,
>  				     struct folio *folio)
>  {
>  	u64 start = folio_pos(folio);
> -	u64 end = start + PAGE_SIZE - 1;
> +	u64 end = start + folio_size(folio) - 1;
>  	bool ret;
>  
>  	if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
> @@ -2659,7 +2664,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree,
>  bool try_release_extent_mapping(struct folio *folio, gfp_t mask)
>  {
>  	u64 start = folio_pos(folio);
> -	u64 end = start + PAGE_SIZE - 1;
> +	u64 end = start + folio_size(folio) - 1;
>  	struct btrfs_inode *inode = folio_to_inode(folio);
>  	struct extent_io_tree *io_tree = &inode->io_tree;
>  
> -- 
> 2.48.1
>
Qu Wenruo March 10, 2025, 11:51 p.m. UTC | #2
On 2025/3/11 09:57, Boris Burkov wrote:
[...]
>> @@ -2503,13 +2503,18 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
>>   		 * code is just in case, but shouldn't actually be run.
>>   		 */
>>   		if (IS_ERR(folio)) {
>> +			cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
>> +			cur_len = cur_end + 1 - cur;
> 
> Curious: is this guess based on PAGE_SIZE more correct than using 1 as
> num_bytes? How much more correct? :)

The filemap (page cache) itself is essentially an xarray indexed by
page index.

That's why filemap_get_folio() only accepts a page index, not a bytenr.

Using a length of 1 would just waste a lot of CPU (PAGE_SIZE lookups)
trying to get the same missing folio at the same index.
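
As a minimal sketch of that point (the helper name and placement below
are mine, not the kernel's): the lookup is per page index, so the loop
wants to advance cur past at least the missing page index, and past the
whole folio when one is found.

static u64 advance_cur(struct address_space *mapping, u64 cur, u64 end)
{
	struct folio *folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT);

	if (IS_ERR(folio)) {
		/* Nothing cached at this index: skip the whole page, not one byte. */
		return min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end) + 1;
	}

	/* A larger folio spans several page indexes: consume all of it at once. */
	cur = min(folio_pos(folio) + folio_size(folio) - 1, end) + 1;
	folio_put(folio);
	return cur;
}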

> 
> Probably better question: what is the behavior for the range
> [PAGE_SIZE, folio_size(folio)] should that filemap_get_folio have
> returned properly?

It depends on the range passed to extent_write_locked_range().

If the range covers [PAGE_SIZE, folio_size(folio)], we just submit the
write in one go, which improves performance.
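
For example (numbers are mine, assuming 4K pages): with a 16K folio at
file offset 0 and end = 64K - 1, cur_end = min(folio_pos(folio) +
folio_size(folio) - 1, end) = 16K - 1 and cur_len = 16K, so the whole
folio is submitted in one iteration instead of four page-sized ones.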


> If it truly can never happen AND we can't handle
> it correctly, is handling it "sort of correctly" a good idea?

So far I think it should not happen, and this function is only called
by the compressed write path when compression fails, which is not a
super hot path.
Thus I tend to keep it as-is for now.

The future plan is to change how we submit compressed writes.
Instead of the more-or-less cursed async extents (just check how many
rabbits I have to pull out of the hat for subpage block-perfect
compression), we will submit the writes no differently than any other
write.

The real compression will then be handled by one extra layer (similar
to RAID56, but not exactly the same) on that bbio.

Then we can get rid of extent_write_locked_range() completely, and no
more such weird handling will be needed.

Thanks,
Qu

> 
>>   			btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
>>   						       cur, cur_len, false);
>>   			mapping_set_error(mapping, PTR_ERR(folio));
>> -			cur = cur_end + 1;
>> +			cur = cur_end;
>>   			continue;
>>   		}
>>   
>> +		cur_end = min(folio_pos(folio) + folio_size(folio) - 1, end);
>> +		cur_len = cur_end + 1 - cur;
>> +
>>   		ASSERT(folio_test_locked(folio));
>>   		if (pages_dirty && folio != locked_folio)
>>   			ASSERT(folio_test_dirty(folio));
>> @@ -2621,7 +2626,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree,
>>   				     struct folio *folio)
>>   {
>>   	u64 start = folio_pos(folio);
>> -	u64 end = start + PAGE_SIZE - 1;
>> +	u64 end = start + folio_size(folio) - 1;
>>   	bool ret;
>>   
>>   	if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
>> @@ -2659,7 +2664,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree,
>>   bool try_release_extent_mapping(struct folio *folio, gfp_t mask)
>>   {
>>   	u64 start = folio_pos(folio);
>> -	u64 end = start + PAGE_SIZE - 1;
>> +	u64 end = start + folio_size(folio) - 1;
>>   	struct btrfs_inode *inode = folio_to_inode(folio);
>>   	struct extent_io_tree *io_tree = &inode->io_tree;
>>   
>> -- 
>> 2.48.1
>>

Patch

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 337d2bed98d9..337908f09b88 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -425,7 +425,7 @@  static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le
 	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
 
 	ASSERT(folio_pos(folio) <= start &&
-	       start + len <= folio_pos(folio) + PAGE_SIZE);
+	       start + len <= folio_pos(folio) + folio_size(folio));
 
 	if (uptodate && btrfs_verify_folio(folio, start, len))
 		btrfs_folio_set_uptodate(fs_info, folio, start, len);
@@ -492,7 +492,7 @@  static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio)
 		return;
 
 	ASSERT(folio_test_private(folio));
-	btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE);
+	btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio));
 }
 
 /*
@@ -753,7 +753,7 @@  static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
 {
 	struct btrfs_inode *inode = folio_to_inode(folio);
 
-	ASSERT(pg_offset + size <= PAGE_SIZE);
+	ASSERT(pg_offset + size <= folio_size(folio));
 	ASSERT(bio_ctrl->end_io_func);
 
 	if (bio_ctrl->bbio &&
@@ -935,7 +935,7 @@  static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
 	struct inode *inode = folio->mapping->host;
 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
 	u64 start = folio_pos(folio);
-	const u64 end = start + PAGE_SIZE - 1;
+	const u64 end = start + folio_size(folio) - 1;
 	u64 extent_offset;
 	u64 last_byte = i_size_read(inode);
 	struct extent_map *em;
@@ -1279,7 +1279,7 @@  static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bit
 	unsigned int start_bit;
 	unsigned int nbits;
 
-	ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE);
+	ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio));
 	start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
 	nbits = len >> fs_info->sectorsize_bits;
 	ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits));
@@ -1297,7 +1297,7 @@  static bool find_next_delalloc_bitmap(struct folio *folio,
 	unsigned int first_zero;
 	unsigned int first_set;
 
-	ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE);
+	ASSERT(start >= folio_start && start < folio_start + folio_size(folio));
 
 	start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
 	first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit);
@@ -1499,10 +1499,10 @@  static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 		delalloc_end = page_end;
 	/*
 	 * delalloc_end is already one less than the total length, so
-	 * we don't subtract one from PAGE_SIZE
+	 * we don't subtract one from folio_size().
 	 */
 	delalloc_to_write +=
-		DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE);
+		DIV_ROUND_UP(delalloc_end + 1 - page_start, folio_size(folio));
 
 	/*
 	 * If all ranges are submitted asynchronously, we just need to account
@@ -1765,7 +1765,7 @@  static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
 		goto done;
 
 	ret = extent_writepage_io(inode, folio, folio_pos(folio),
-				  PAGE_SIZE, bio_ctrl, i_size);
+				  folio_size(folio), bio_ctrl, i_size);
 	if (ret == 1)
 		return 0;
 	if (ret < 0)
@@ -2492,8 +2492,8 @@  void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
 	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
 
 	while (cur <= end) {
-		u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
-		u32 cur_len = cur_end + 1 - cur;
+		u64 cur_end;
+		u32 cur_len;
 		struct folio *folio;
 
 		folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT);
@@ -2503,13 +2503,18 @@  void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
 		 * code is just in case, but shouldn't actually be run.
 		 */
 		if (IS_ERR(folio)) {
+			cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
+			cur_len = cur_end + 1 - cur;
 			btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
 						       cur, cur_len, false);
 			mapping_set_error(mapping, PTR_ERR(folio));
-			cur = cur_end + 1;
+			cur = cur_end;
 			continue;
 		}
 
+		cur_end = min(folio_pos(folio) + folio_size(folio) - 1, end);
+		cur_len = cur_end + 1 - cur;
+
 		ASSERT(folio_test_locked(folio));
 		if (pages_dirty && folio != locked_folio)
 			ASSERT(folio_test_dirty(folio));
@@ -2621,7 +2626,7 @@  static bool try_release_extent_state(struct extent_io_tree *tree,
 				     struct folio *folio)
 {
 	u64 start = folio_pos(folio);
-	u64 end = start + PAGE_SIZE - 1;
+	u64 end = start + folio_size(folio) - 1;
 	bool ret;
 
 	if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
@@ -2659,7 +2664,7 @@  static bool try_release_extent_state(struct extent_io_tree *tree,
 bool try_release_extent_mapping(struct folio *folio, gfp_t mask)
 {
 	u64 start = folio_pos(folio);
-	u64 end = start + PAGE_SIZE - 1;
+	u64 end = start + folio_size(folio) - 1;
 	struct btrfs_inode *inode = folio_to_inode(folio);
 	struct extent_io_tree *io_tree = &inode->io_tree;