| Message ID | 657d28be4aebee9d3b40e7e34b0c1b75fbbf5da6.1741591823.git.wqu@suse.com |
|---|---|
| State | New |
| Series | btrfs: prepare for larger folios support |
On Mon, Mar 10, 2025 at 06:06:01PM +1030, Qu Wenruo wrote:
> When we're handling folios from filemap, we can no longer assume all
> folios are page sized.
>
> Thus for call sites assuming the folio is page sized, change the
> PAGE_SIZE usage to folio_size() instead.
>
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  fs/btrfs/extent_io.c | 33 +++++++++++++++++++--------------
>  1 file changed, 19 insertions(+), 14 deletions(-)
>
[...]
> @@ -2503,13 +2503,18 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
>  		 * code is just in case, but shouldn't actually be run.
>  		 */
>  		if (IS_ERR(folio)) {
> +			cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
> +			cur_len = cur_end + 1 - cur;

Curious: is this guess based on PAGE_SIZE more correct than using 1 as
num_bytes? How much more correct? :)

Probably better question: what is the behavior for the range
[PAGE_SIZE, folio_size(folio)] should that filemap_get_folio have
returned properly? If it truly can never happen AND we can't handle
it correctly, is handling it "sort of correctly" a good idea?

>  			btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
>  						       cur, cur_len, false);
>  			mapping_set_error(mapping, PTR_ERR(folio));
> -			cur = cur_end + 1;
> +			cur = cur_end;
>  			continue;
>  		}
>
> +		cur_end = min(folio_pos(folio) + folio_size(folio) - 1, end);
> +		cur_len = cur_end + 1 - cur;
> +
>  		ASSERT(folio_test_locked(folio));
>  		if (pages_dirty && folio != locked_folio)
>  			ASSERT(folio_test_dirty(folio));
[...]
On 2025/3/11 09:57, Boris Burkov wrote:
[...]
>> @@ -2503,13 +2503,18 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
>>  		 * code is just in case, but shouldn't actually be run.
>>  		 */
>>  		if (IS_ERR(folio)) {
>> +			cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
>> +			cur_len = cur_end + 1 - cur;
>
> Curious: is this guess based on PAGE_SIZE more correct than using 1 as
> num_bytes? How much more correct? :)

Filemap (page cache) itself is mostly an xarray indexed by page index.
That's why filemap_get_folio() only accepts a page index, not a bytenr.

Using length 1 would just waste a lot of CPU (PAGE_SIZE times) trying to
get the same missing folio at the same index.

>
> Probably better question: what is the behavior for the range
> [PAGE_SIZE, folio_size(folio)] should that filemap_get_folio have
> returned properly?

Depends on the range of extent_write_locked_range().

If the range covers [PAGE_SIZE, folio_size(folio)], we just submit the
write in one go, thus improving the performance.

> If it truly can never happen AND we can't handle
> it correctly, is handling it "sort of correctly" a good idea?

So far I think it should not happen, but this is only called by the
compressed write path when compression has failed, which is not a super
hot path. Thus I tend to keep it as-is for now.

The future plan is to change how we submit compressed writes. Instead of
the more-or-less cursed async extent (just check how many rabbits I have
to pull out of the hat for subpage block-perfect compression), we will
submit the write no differently than any other write, and the real
compression will be handled by one extra layer (like RAID56, but not
exactly the same) on that bbio.

Then we can get rid of extent_write_locked_range() completely, thus no
more such weird handling.

Thanks,
Qu
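To make the lookup-count argument above concrete, here is a standalone userspace sketch (assumed 4K pages and a 16K folio, not kernel code): walking a 64K range folio by folio only needs a few lookups, while advancing one byte at a time would repeat the lookup PAGE_SIZE times for every page index.

```c
/* Standalone sketch of the stepping argument above; sizes are assumptions. */
#include <stdio.h>
#include <stdint.h>

#define SKETCH_PAGE_SIZE   4096ULL
#define SKETCH_FOLIO_SIZE 16384ULL   /* assumed order-2 folio */

int main(void)
{
	const uint64_t start = 0, end = 64 * 1024 - 1;   /* a 64K locked range */
	uint64_t cur = start;
	unsigned int lookups = 0;

	while (cur <= end) {
		/* Models one filemap_get_folio() call, keyed by page index. */
		lookups++;
		/* Folio "found": consume it whole, mirroring the patch's
		 * cur_end = min(folio_pos(folio) + folio_size(folio) - 1, end). */
		uint64_t cur_end = (cur / SKETCH_FOLIO_SIZE) * SKETCH_FOLIO_SIZE
				   + SKETCH_FOLIO_SIZE - 1;
		if (cur_end > end)
			cur_end = end;
		cur = cur_end + 1;
	}
	printf("folio-sized steps: %u lookups\n", lookups);            /* 4 */
	printf("1-byte steps would repeat each page index %llu times\n",
	       (unsigned long long)SKETCH_PAGE_SIZE);                  /* 4096 */
	return 0;
}
```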
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 337d2bed98d9..337908f09b88 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -425,7 +425,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le
 	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
 
 	ASSERT(folio_pos(folio) <= start &&
-	       start + len <= folio_pos(folio) + PAGE_SIZE);
+	       start + len <= folio_pos(folio) + folio_size(folio));
 
 	if (uptodate && btrfs_verify_folio(folio, start, len))
 		btrfs_folio_set_uptodate(fs_info, folio, start, len);
@@ -492,7 +492,7 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio)
 		return;
 
 	ASSERT(folio_test_private(folio));
-	btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE);
+	btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio));
 }
 
 /*
@@ -753,7 +753,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
 {
 	struct btrfs_inode *inode = folio_to_inode(folio);
 
-	ASSERT(pg_offset + size <= PAGE_SIZE);
+	ASSERT(pg_offset + size <= folio_size(folio));
 	ASSERT(bio_ctrl->end_io_func);
 
 	if (bio_ctrl->bbio &&
@@ -935,7 +935,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
 	struct inode *inode = folio->mapping->host;
 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
 	u64 start = folio_pos(folio);
-	const u64 end = start + PAGE_SIZE - 1;
+	const u64 end = start + folio_size(folio) - 1;
 	u64 extent_offset;
 	u64 last_byte = i_size_read(inode);
 	struct extent_map *em;
@@ -1279,7 +1279,7 @@ static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bit
 	unsigned int start_bit;
 	unsigned int nbits;
 
-	ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE);
+	ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio));
 	start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
 	nbits = len >> fs_info->sectorsize_bits;
 	ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits));
@@ -1297,7 +1297,7 @@ static bool find_next_delalloc_bitmap(struct folio *folio,
 	unsigned int first_zero;
 	unsigned int first_set;
 
-	ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE);
+	ASSERT(start >= folio_start && start < folio_start + folio_size(folio));
 
 	start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
 	first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit);
@@ -1499,10 +1499,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 			delalloc_end = page_end;
 		/*
 		 * delalloc_end is already one less than the total length, so
-		 * we don't subtract one from PAGE_SIZE
+		 * we don't subtract one from folio_size().
 		 */
 		delalloc_to_write +=
-			DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE);
+			DIV_ROUND_UP(delalloc_end + 1 - page_start, folio_size(folio));
 
 	/*
 	 * If all ranges are submitted asynchronously, we just need to account
@@ -1765,7 +1765,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
 		goto done;
 
 	ret = extent_writepage_io(inode, folio, folio_pos(folio),
-				  PAGE_SIZE, bio_ctrl, i_size);
+				  folio_size(folio), bio_ctrl, i_size);
 	if (ret == 1)
 		return 0;
 	if (ret < 0)
@@ -2492,8 +2492,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
 	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
 
 	while (cur <= end) {
-		u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
-		u32 cur_len = cur_end + 1 - cur;
+		u64 cur_end;
+		u32 cur_len;
 		struct folio *folio;
 
 		folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT);
@@ -2503,13 +2503,18 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
 		 * code is just in case, but shouldn't actually be run.
 		 */
 		if (IS_ERR(folio)) {
+			cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
+			cur_len = cur_end + 1 - cur;
 			btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
 						       cur, cur_len, false);
 			mapping_set_error(mapping, PTR_ERR(folio));
-			cur = cur_end + 1;
+			cur = cur_end;
 			continue;
 		}
 
+		cur_end = min(folio_pos(folio) + folio_size(folio) - 1, end);
+		cur_len = cur_end + 1 - cur;
+
 		ASSERT(folio_test_locked(folio));
 		if (pages_dirty && folio != locked_folio)
 			ASSERT(folio_test_dirty(folio));
@@ -2621,7 +2626,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree,
 				     struct folio *folio)
 {
 	u64 start = folio_pos(folio);
-	u64 end = start + PAGE_SIZE - 1;
+	u64 end = start + folio_size(folio) - 1;
 	bool ret;
 
 	if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
@@ -2659,7 +2664,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree,
 bool try_release_extent_mapping(struct folio *folio, gfp_t mask)
 {
 	u64 start = folio_pos(folio);
-	u64 end = start + PAGE_SIZE - 1;
+	u64 end = start + folio_size(folio) - 1;
 	struct btrfs_inode *inode = folio_to_inode(folio);
 	struct extent_io_tree *io_tree = &inode->io_tree;
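As a quick sanity check of the writepage_delalloc() accounting hunk above (the numbers below are assumptions, not values from the patch): delalloc_end is an inclusive offset, so the "+ 1" turns it into a length, and DIV_ROUND_UP() then counts whole folios rather than pages.

```c
/* Standalone arithmetic check; sizes are assumptions, not kernel values. */
#include <stdio.h>
#include <stdint.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	const uint64_t page_size    = 4096;    /* assumed */
	const uint64_t folio_size   = 16384;   /* assumed order-2 folio */
	const uint64_t page_start   = 0;
	const uint64_t delalloc_end = 32767;   /* inclusive end of a 32K range */

	/* delalloc_end is already one less than the total length, hence "+ 1". */
	printf("%llu folios to write\n",
	       (unsigned long long)DIV_ROUND_UP(delalloc_end + 1 - page_start, folio_size)); /* 2 */
	printf("%llu pages to write\n",
	       (unsigned long long)DIV_ROUND_UP(delalloc_end + 1 - page_start, page_size));  /* 8 */
	return 0;
}
```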
When we're handling folios from filemap, we can no longer assume all
folios are page sized.

Thus for call sites assuming the folio is page sized, change the
PAGE_SIZE usage to folio_size() instead.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/extent_io.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)
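A minimal illustration of what the conversion guards against, with assumed sizes (in the kernel, folio_size(folio) is PAGE_SIZE << folio_order(folio)): a range that is perfectly valid inside a large folio can extend past its first page, so a bounds check written against PAGE_SIZE would wrongly fire.

```c
/* Userspace illustration with assumed sizes; not the btrfs code itself. */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	const uint64_t page_size  = 4096;        /* assumed */
	const uint64_t folio_size = 4096 << 2;   /* assumed order-2 folio: 16K */
	const uint64_t folio_pos  = 0;
	/* A 4K range ending at byte 12287: inside the folio, past the first page. */
	const uint64_t start = 8192, len = 4096;

	assert(start + len <= folio_pos + folio_size);    /* folio_size()-style check passes */
	assert(!(start + len <= folio_pos + page_size));  /* old PAGE_SIZE-style check would trip */
	return 0;
}
```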