
btrfs: refactor __extent_writepage_io() to do sector-by-sector submission

Message ID 9102c028537fbc1d94c4b092dd4a9940661bc58b.1723020573.git.wqu@suse.com (mailing list archive)
State New, archived
Series btrfs: refactor __extent_writepage_io() to do sector-by-sector submission

Commit Message

Qu Wenruo Aug. 7, 2024, 8:51 a.m. UTC
Unlike the bitmap usage inside raid56, __extent_writepage_io() handles
subpage submission not sector-by-sector, but for each dirty range it
finds.

This is normally not a big deal, as the complex subpage code is already
mostly optimized out.

For the sake of consistency and for the future of subpage sector-perfect
compression support, this patch does:

- Extract the sector submission code into submit_one_sector()

- Add the needed code to extract the dirty bitmap for subpage case

- Use bitmap_and() to calculate the target sectors we need to submit

For x86_64, the dirty bitmap will be fixed to 1, with a length of 1,
so we're still doing the same amount of work per sector.

For larger page sizes, the overhead will be a little larger, as
previously we only needed one extent_map lookup per dirty range, but
now it will be one extent_map lookup per sector.

But that is the same frequency as x86_64, so we're just aligning the
behavior to x86_64.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
The plan for sector-perfect subpage compression is to introduce another
bitmap, possibly inside bio_ctrl, to indicate which sectors do not need
writeback submission (either inline, or async submission).

So that __extent_writepage_io() can properly skip those ranges to
support sector-perfect subpage compression.
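
A rough sketch of that plan (nothing below exists yet; the @skip_bitmap
member name is made up for illustration) would be to mask those sectors
out before the submission loop:

	/*
	 * Hypothetical: @bio_ctrl->skip_bitmap would be populated by
	 * the inline/compression paths for sectors that must not go
	 * through regular writeback submission.
	 */
	bitmap_andnot(&dirty_bitmap, &dirty_bitmap,
		      &bio_ctrl->skip_bitmap, bitmap_size);

	for_each_set_bit(bit, &dirty_bitmap, bitmap_size) {
		cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits);
		ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size);
		if (ret < 0)
			goto out_error;
		nr++;
	}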
---
 fs/btrfs/extent_io.c | 188 +++++++++++++++++--------------------------
 fs/btrfs/subpage.c   |  17 ++++
 fs/btrfs/subpage.h   |   3 +
 3 files changed, 96 insertions(+), 112 deletions(-)

Comments

Josef Bacik Aug. 7, 2024, 2:18 p.m. UTC | #1
On Wed, Aug 07, 2024 at 06:21:00PM +0930, Qu Wenruo wrote:
> Unlike the bitmap usage inside raid56, __extent_writepage_io() handles
> subpage submission not sector-by-sector, but for each dirty range it
> finds.
> 
> This is normally not a big deal, as the complex subpage code is already
> mostly optimized out.
> 
> For the sake of consistency and for the future of subpage sector-perfect
> compression support, this patch does:
> 
> - Extract the sector submission code into submit_one_sector()
> 
> - Add the needed code to extract the dirty bitmap for subpage case
> 
> - Use bitmap_and() to calculate the target sectors we need to submit
> 
> For x86_64, the dirty bitmap will be fixed to 1, with a length of 1,
> so we're still doing the same amount of work per sector.
> 
> For larger page sizes, the overhead will be a little larger, as
> previously we only needed one extent_map lookup per dirty range, but
> now it will be one extent_map lookup per sector.

I'd like this to be a followup, because even in the normal page case (or hell
the subpage case) we shouldn't have to look up the extent map over and over
again, it could span a much larger area.

> 
> But that is the same frequency as x86_64, so we're just aligning the
> behavior to x86_64.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
> The plan for sector-perfect subpage compression is to introduce another
> bitmap, possibly inside bio_ctrl, to indicate which sectors do not need
> writeback submission (either inline, or async submission).
> 
> So that __extent_writepage_io() can properly skip those ranges to
> support sector-perfect subpage compression.
> ---
>  fs/btrfs/extent_io.c | 188 +++++++++++++++++--------------------------
>  fs/btrfs/subpage.c   |  17 ++++
>  fs/btrfs/subpage.h   |   3 +
>  3 files changed, 96 insertions(+), 112 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 040c92541bc9..6acd763e8f26 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -1333,56 +1333,65 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
>  }
>  
>  /*
> - * Find the first byte we need to write.
> + * Return 0 if we have submitted or queued the sector for submission.
> + * Return <0 for critical errors.
>   *
> - * For subpage, one page can contain several sectors, and
> - * __extent_writepage_io() will just grab all extent maps in the page
> - * range and try to submit all non-inline/non-compressed extents.
> - *
> - * This is a big problem for subpage, we shouldn't re-submit already written
> - * data at all.
> - * This function will lookup subpage dirty bit to find which range we really
> - * need to submit.
> - *
> - * Return the next dirty range in [@start, @end).
> - * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
> + * Caller should make sure filepos < i_size and handle filepos >= i_size case.
>   */
> -static void find_next_dirty_byte(const struct btrfs_fs_info *fs_info,
> -				 struct folio *folio, u64 *start, u64 *end)
> +static int submit_one_sector(struct btrfs_inode *inode,
> +			     struct folio *folio,
> +			     u64 filepos, struct btrfs_bio_ctrl *bio_ctrl,
> +			     loff_t i_size)
>  {
> -	struct btrfs_subpage *subpage = folio_get_private(folio);
> -	struct btrfs_subpage_info *spi = fs_info->subpage_info;
> -	u64 orig_start = *start;
> -	/* Declare as unsigned long so we can use bitmap ops */
> -	unsigned long flags;
> -	int range_start_bit;
> -	int range_end_bit;
> +	struct btrfs_fs_info *fs_info = inode->root->fs_info;
> +	struct extent_map *em;
> +	u64 block_start;
> +	u64 disk_bytenr;
> +	u64 extent_offset;
> +	u64 em_end;
> +	u32 sectorsize = fs_info->sectorsize;
>  
> -	/*
> -	 * For regular sector size == page size case, since one page only
> -	 * contains one sector, we return the page offset directly.
> -	 */
> -	if (!btrfs_is_subpage(fs_info, folio->mapping)) {
> -		*start = folio_pos(folio);
> -		*end = folio_pos(folio) + folio_size(folio);
> -		return;
> +	ASSERT(IS_ALIGNED(filepos, sectorsize));
> +
> +	/* @filepos >= i_size case should be handled by the caller. */
> +	ASSERT(filepos < i_size);
> +
> +	em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
> +	if (IS_ERR(em))
> +		return PTR_ERR_OR_ZERO(em);
> +
> +	extent_offset = filepos - em->start;
> +	em_end = extent_map_end(em);
> +	ASSERT(filepos <= em_end);
> +	ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
> +	ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
> +
> +	block_start = extent_map_block_start(em);
> +	disk_bytenr = extent_map_block_start(em) + extent_offset;
> +
> +	ASSERT(!extent_map_is_compressed(em));
> +	ASSERT(block_start != EXTENT_MAP_HOLE);
> +	ASSERT(block_start != EXTENT_MAP_INLINE);
> +
> +	free_extent_map(em);
> +	em = NULL;
> +
> +	btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1);
> +	if (!folio_test_writeback(folio)) {
> +		btrfs_err(fs_info,
> +			  "folio %lu not writeback, cur %llu end %llu",
> +			  folio->index, filepos, filepos + sectorsize);
>  	}
> -
> -	range_start_bit = spi->dirty_offset +
> -			  (offset_in_folio(folio, orig_start) >>
> -			   fs_info->sectorsize_bits);
> -
> -	/* We should have the page locked, but just in case */
> -	spin_lock_irqsave(&subpage->lock, flags);
> -	bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
> -			       spi->dirty_offset + spi->bitmap_nr_bits);
> -	spin_unlock_irqrestore(&subpage->lock, flags);
> -
> -	range_start_bit -= spi->dirty_offset;
> -	range_end_bit -= spi->dirty_offset;
> -
> -	*start = folio_pos(folio) + range_start_bit * fs_info->sectorsize;
> -	*end = folio_pos(folio) + range_end_bit * fs_info->sectorsize;
> +	/*
> +	 * Although the PageDirty bit is cleared before entering this
> +	 * function, subpage dirty bit is not cleared.
> +	 * So clear subpage dirty bit here so next time we won't submit
> +	 * folio for range already written to disk.
> +	 */
> +	btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
> +	submit_extent_folio(bio_ctrl, disk_bytenr, folio,
> +			    sectorsize, filepos - folio_pos(folio));
> +	return 0;
>  }
>  
>  /*
> @@ -1401,10 +1410,10 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
>  {
>  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
>  	u64 cur = start;
> -	u64 end = start + len - 1;
> -	u64 extent_offset;
> -	u64 block_start;
> -	struct extent_map *em;
> +	unsigned long dirty_bitmap = 1;
> +	unsigned long range_bitmap = 0;
> +	unsigned int bitmap_size = 1;
> +	int bit;
>  	int ret = 0;
>  	int nr = 0;
>  
> @@ -1419,18 +1428,23 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
>  		return 1;
>  	}
>  
> +	if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) {
> +		ASSERT(fs_info->subpage_info);
> +		btrfs_get_subpage_dirty_bitmap(fs_info, folio, &dirty_bitmap);
> +		bitmap_size = fs_info->subpage_info->bitmap_nr_bits;
> +	}
> +	for (cur = start; cur < start + len; cur += fs_info->sectorsize)
> +		set_bit((cur - start) >> fs_info->sectorsize_bits, &range_bitmap);
> +	bitmap_and(&dirty_bitmap, &dirty_bitmap, &range_bitmap, bitmap_size);

I'd rather move this completely under the btrfs_is_subpage() case, we don't need
to do this where sectorsize == page size.
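
I.e. something like this (completely untested sketch):

	if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) {
		ASSERT(fs_info->subpage_info);
		btrfs_get_subpage_dirty_bitmap(fs_info, folio, &dirty_bitmap);
		bitmap_size = fs_info->subpage_info->bitmap_nr_bits;
		for (cur = start; cur < start + len; cur += fs_info->sectorsize)
			set_bit((cur - start) >> fs_info->sectorsize_bits,
				&range_bitmap);
		bitmap_and(&dirty_bitmap, &dirty_bitmap, &range_bitmap,
			   bitmap_size);
	}

so the sectorsize == page size case just keeps the initial
dirty_bitmap = 1.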

Other than that this cleans things up nicely, you can add my

Reviewed-by: Josef Bacik <josef@toxicpanda.com>

to v2.  Thanks,

Josef
David Sterba Aug. 7, 2024, 4:27 p.m. UTC | #2
On Wed, Aug 07, 2024 at 06:21:00PM +0930, Qu Wenruo wrote:
> Unlike the bitmap usage inside raid56, __extent_writepage_io() handles
> subpage submission not sector-by-sector, but for each dirty range it
> finds.
> 
> This is normally not a big deal, as the complex subpage code is already
> mostly optimized out.
> 
> For the sake of consistency and for the future of subpage sector-perfect
> compression support, this patch does:
> 
> - Extract the sector submission code into submit_one_sector()
> 
> - Add the needed code to extract the dirty bitmap for subpage case
> 
> - Use bitmap_and() to calculate the target sectors we need to submit
> 
> For x86_64, the dirty bitmap will be fixed to 1, with a length of 1,
> so we're still doing the same amount of work per sector.
> 
> For larger page sizes, the overhead will be a little larger, as
> previously we only needed one extent_map lookup per dirty range, but
> now it will be one extent_map lookup per sector.
> 
> But that is the same frequency as x86_64, so we're just aligning the
> behavior to x86_64.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
> The plan for sector-perfect subpage compression is to introduce another
> bitmap, possibly inside bio_ctrl, to indicate which sectors do not need
> writeback submission (either inline, or async submission).
> 
> So that __extent_writepage_io() can properly skip those ranges to
> support sector-perfect subpage compression.
> ---
>  fs/btrfs/extent_io.c | 188 +++++++++++++++++--------------------------
>  fs/btrfs/subpage.c   |  17 ++++
>  fs/btrfs/subpage.h   |   3 +
>  3 files changed, 96 insertions(+), 112 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 040c92541bc9..6acd763e8f26 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -1333,56 +1333,65 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
>  }
>  
>  /*
> - * Find the first byte we need to write.
> + * Return 0 if we have submitted or queued the sector for submission.
> + * Return <0 for critical errors.
>   *
> - * For subpage, one page can contain several sectors, and
> - * __extent_writepage_io() will just grab all extent maps in the page
> - * range and try to submit all non-inline/non-compressed extents.
> - *
> - * This is a big problem for subpage, we shouldn't re-submit already written
> - * data at all.
> - * This function will lookup subpage dirty bit to find which range we really
> - * need to submit.
> - *
> - * Return the next dirty range in [@start, @end).
> - * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
> + * Caller should make sure filepos < i_size and handle filepos >= i_size case.
>   */
> -static void find_next_dirty_byte(const struct btrfs_fs_info *fs_info,
> -				 struct folio *folio, u64 *start, u64 *end)
> +static int submit_one_sector(struct btrfs_inode *inode,
> +			     struct folio *folio,
> +			     u64 filepos, struct btrfs_bio_ctrl *bio_ctrl,
> +			     loff_t i_size)
>  {
> -	struct btrfs_subpage *subpage = folio_get_private(folio);
> -	struct btrfs_subpage_info *spi = fs_info->subpage_info;
> -	u64 orig_start = *start;
> -	/* Declare as unsigned long so we can use bitmap ops */
> -	unsigned long flags;
> -	int range_start_bit;
> -	int range_end_bit;
> +	struct btrfs_fs_info *fs_info = inode->root->fs_info;
> +	struct extent_map *em;
> +	u64 block_start;
> +	u64 disk_bytenr;
> +	u64 extent_offset;
> +	u64 em_end;
> +	u32 sectorsize = fs_info->sectorsize;

This can be const

>  
> -	/*
> -	 * For regular sector size == page size case, since one page only
> -	 * contains one sector, we return the page offset directly.
> -	 */
> -	if (!btrfs_is_subpage(fs_info, folio->mapping)) {
> -		*start = folio_pos(folio);
> -		*end = folio_pos(folio) + folio_size(folio);
> -		return;
> +	ASSERT(IS_ALIGNED(filepos, sectorsize));
> +
> +	/* @filepos >= i_size case should be handled by the caller. */
> +	ASSERT(filepos < i_size);
> +
> +	em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
> +	if (IS_ERR(em))
> +		return PTR_ERR_OR_ZERO(em);
> +
> +	extent_offset = filepos - em->start;
> +	em_end = extent_map_end(em);
> +	ASSERT(filepos <= em_end);
> +	ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
> +	ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));

And the local sectorsize could be used here

> +
> +	block_start = extent_map_block_start(em);
> +	disk_bytenr = extent_map_block_start(em) + extent_offset;
> +
> +	ASSERT(!extent_map_is_compressed(em));
> +	ASSERT(block_start != EXTENT_MAP_HOLE);
> +	ASSERT(block_start != EXTENT_MAP_INLINE);
> +
> +	free_extent_map(em);
> +	em = NULL;
> +
> +	btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1);
> +	if (!folio_test_writeback(folio)) {
> +		btrfs_err(fs_info,
> +			  "folio %lu not writeback, cur %llu end %llu",
> +			  folio->index, filepos, filepos + sectorsize);

This is copied from the original code but I wonder if this should be a
hard error or more visible as it looks like bad page state tracking.


>  	}
> -
> -	range_start_bit = spi->dirty_offset +
> -			  (offset_in_folio(folio, orig_start) >>
> -			   fs_info->sectorsize_bits);
> -
> -	/* We should have the page locked, but just in case */
> -	spin_lock_irqsave(&subpage->lock, flags);
> -	bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
> -			       spi->dirty_offset + spi->bitmap_nr_bits);
> -	spin_unlock_irqrestore(&subpage->lock, flags);
> -
> -	range_start_bit -= spi->dirty_offset;
> -	range_end_bit -= spi->dirty_offset;
> -
> -	*start = folio_pos(folio) + range_start_bit * fs_info->sectorsize;
> -	*end = folio_pos(folio) + range_end_bit * fs_info->sectorsize;
> +	/*
> +	 * Although the PageDirty bit is cleared before entering this
> +	 * function, subpage dirty bit is not cleared.
> +	 * So clear subpage dirty bit here so next time we won't submit
> +	 * folio for range already written to disk.
> +	 */
> +	btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
> +	submit_extent_folio(bio_ctrl, disk_bytenr, folio,
> +			    sectorsize, filepos - folio_pos(folio));
> +	return 0;
>  }
>  
>  /*
Qu Wenruo Aug. 7, 2024, 10:01 p.m. UTC | #3
On 2024/8/7 23:48, Josef Bacik wrote:
> On Wed, Aug 07, 2024 at 06:21:00PM +0930, Qu Wenruo wrote:
>> Unlike the bitmap usage inside raid56, __extent_writepage_io() handles
>> subpage submission not sector-by-sector, but for each dirty range it
>> finds.
>>
>> This is normally not a big deal, as the complex subpage code is already
>> mostly optimized out.
>>
>> For the sake of consistency and for the future of subpage sector-perfect
>> compression support, this patch does:
>>
>> - Extract the sector submission code into submit_one_sector()
>>
>> - Add the needed code to extract the dirty bitmap for subpage case
>>
>> - Use bitmap_and() to calculate the target sectors we need to submit
>>
>> For x86_64, the dirty bitmap will be fixed to 1, with a length of 1,
>> so we're still doing the same amount of work per sector.
>>
>> For larger page sizes, the overhead will be a little larger, as
>> previously we only needed one extent_map lookup per dirty range, but
>> now it will be one extent_map lookup per sector.
>
> I'd like this to be a followup, because even in the normal page case (or hell
> the subpage case) we shouldn't have to look up the extent map over and over
> again, it could span a much larger area.

Yep, that's also something I want to improve.

My guess is that using a cached extent_map can improve the situation?

But that will be another patch, purely for performance improvement.
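
Something like this (untested) sketch, keeping the last extent_map
alive across the per-sector loop:

	struct extent_map *cached = NULL;

	for_each_set_bit(bit, &dirty_bitmap, bitmap_size) {
		u64 cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits);

		/* Drop the cached map once the sector falls outside it. */
		if (cached && cur >= extent_map_end(cached)) {
			free_extent_map(cached);
			cached = NULL;
		}
		if (!cached) {
			cached = btrfs_get_extent(inode, NULL, cur,
						  fs_info->sectorsize);
			if (IS_ERR(cached))
				return PTR_ERR(cached);
		}
		/* ... submit the sector using @cached ... */
	}
	if (cached)
		free_extent_map(cached);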

[...]
>>
>> +	if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) {
>> +		ASSERT(fs_info->subpage_info);
>> +		btrfs_get_subpage_dirty_bitmap(fs_info, folio, &dirty_bitmap);
>> +		bitmap_size = fs_info->subpage_info->bitmap_nr_bits;
>> +	}
>> +	for (cur = start; cur < start + len; cur += fs_info->sectorsize)
>> +		set_bit((cur - start) >> fs_info->sectorsize_bits, &range_bitmap);
>> +	bitmap_and(&dirty_bitmap, &dirty_bitmap, &range_bitmap, bitmap_size);
>
> I'd rather move this completely under the btrfs_is_subpage() case, we don't need
> to do this where sectorsize == page size.

This has a tiny problem: the page dirty and subpage dirty flags are not
in sync.

For sectorsize == PAGE_SIZE, the set_bit() is only called once, and
bitmap_and() is very cheap for a single-bit operation too.

The complex part is already under btrfs_is_subpage() check.

In fact, the set_bit() and bitmap_and() will later be reused for the
inline/compression skip, for both the regular sectorsize == page size
and subpage cases.

(There will be another new bitmap introduced into bio_ctrl, to indicate
which sector(s) do not need writeback, thus the bitmap operations will
be needed for that future change.)

Thanks,
Qu

>
> Other than that this cleans things up nicely, you can add my
>
> Reviewed-by: Josef Bacik <josef@toxicpanda.com>
>
> to v2.  Thanks,
>
> Josef
>
Qu Wenruo Aug. 7, 2024, 10:05 p.m. UTC | #4
On 2024/8/8 01:57, David Sterba wrote:
> On Wed, Aug 07, 2024 at 06:21:00PM +0930, Qu Wenruo wrote:
[...]
>
>> +
>> +	block_start = extent_map_block_start(em);
>> +	disk_bytenr = extent_map_block_start(em) + extent_offset;
>> +
>> +	ASSERT(!extent_map_is_compressed(em));
>> +	ASSERT(block_start != EXTENT_MAP_HOLE);
>> +	ASSERT(block_start != EXTENT_MAP_INLINE);
>> +
>> +	free_extent_map(em);
>> +	em = NULL;
>> +
>> +	btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1);
>> +	if (!folio_test_writeback(folio)) {
>> +		btrfs_err(fs_info,
>> +			  "folio %lu not writeback, cur %llu end %llu",
>> +			  folio->index, filepos, filepos + sectorsize);
>
> This is copied from the original code but I wonder if this should be a
> hard error or more visible as it looks like bad page state tracking.
>

I believe this can be removed completely, or converted into an ASSERT().

For the subpage case, page writeback is set if any subpage sector has
the writeback flag set.

For the regular case, it sets writeback on the whole page.

As long as the folio is already locked, the folio should always be
marked writeback.

So I prefer to go the ASSERT() path.

Thanks,
Qu

>
>>   	}
>> -
>> -	range_start_bit = spi->dirty_offset +
>> -			  (offset_in_folio(folio, orig_start) >>
>> -			   fs_info->sectorsize_bits);
>> -
>> -	/* We should have the page locked, but just in case */
>> -	spin_lock_irqsave(&subpage->lock, flags);
>> -	bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
>> -			       spi->dirty_offset + spi->bitmap_nr_bits);
>> -	spin_unlock_irqrestore(&subpage->lock, flags);
>> -
>> -	range_start_bit -= spi->dirty_offset;
>> -	range_end_bit -= spi->dirty_offset;
>> -
>> -	*start = folio_pos(folio) + range_start_bit * fs_info->sectorsize;
>> -	*end = folio_pos(folio) + range_end_bit * fs_info->sectorsize;
>> +	/*
>> +	 * Although the PageDirty bit is cleared before entering this
>> +	 * function, subpage dirty bit is not cleared.
>> +	 * So clear subpage dirty bit here so next time we won't submit
>> +	 * folio for range already written to disk.
>> +	 */
>> +	btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
>> +	submit_extent_folio(bio_ctrl, disk_bytenr, folio,
>> +			    sectorsize, filepos - folio_pos(folio));
>> +	return 0;
>>   }
>>
>>   /*
>
David Sterba Aug. 7, 2024, 11:22 p.m. UTC | #5
On Thu, Aug 08, 2024 at 07:35:40AM +0930, Qu Wenruo wrote:
> 
> 
> On 2024/8/8 01:57, David Sterba wrote:
> > On Wed, Aug 07, 2024 at 06:21:00PM +0930, Qu Wenruo wrote:
> [...]
> >
> >> +
> >> +	block_start = extent_map_block_start(em);
> >> +	disk_bytenr = extent_map_block_start(em) + extent_offset;
> >> +
> >> +	ASSERT(!extent_map_is_compressed(em));
> >> +	ASSERT(block_start != EXTENT_MAP_HOLE);
> >> +	ASSERT(block_start != EXTENT_MAP_INLINE);
> >> +
> >> +	free_extent_map(em);
> >> +	em = NULL;
> >> +
> >> +	btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1);
> >> +	if (!folio_test_writeback(folio)) {
> >> +		btrfs_err(fs_info,
> >> +			  "folio %lu not writeback, cur %llu end %llu",
> >> +			  folio->index, filepos, filepos + sectorsize);
> >
> > This is copied from the original code but I wonder if this should be a
> > hard error or more visible as it looks like bad page state tracking.
> >
> 
> I believe this can be removed completely, or converted into ASSERT().
> 
> For subpage case, page writeback is set if any subpage sector has
> writeback flag set.
> 
> For regular case, it's setting the whole page writeback.
> 
> As long as the folio is already locked, the folio should always be
> marked writeback.
> 
> So I prefer to go the ASSERT() path.

Agreed, ASSERT makes sense.
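
I.e. something like:

	btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1);
	/* The folio is locked and the range was just marked writeback. */
	ASSERT(folio_test_writeback(folio));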

Patch

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 040c92541bc9..6acd763e8f26 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1333,56 +1333,65 @@  static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 }
 
 /*
- * Find the first byte we need to write.
+ * Return 0 if we have submitted or queued the sector for submission.
+ * Return <0 for critical errors.
  *
- * For subpage, one page can contain several sectors, and
- * __extent_writepage_io() will just grab all extent maps in the page
- * range and try to submit all non-inline/non-compressed extents.
- *
- * This is a big problem for subpage, we shouldn't re-submit already written
- * data at all.
- * This function will lookup subpage dirty bit to find which range we really
- * need to submit.
- *
- * Return the next dirty range in [@start, @end).
- * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
+ * Caller should make sure filepos < i_size and handle filepos >= i_size case.
  */
-static void find_next_dirty_byte(const struct btrfs_fs_info *fs_info,
-				 struct folio *folio, u64 *start, u64 *end)
+static int submit_one_sector(struct btrfs_inode *inode,
+			     struct folio *folio,
+			     u64 filepos, struct btrfs_bio_ctrl *bio_ctrl,
+			     loff_t i_size)
 {
-	struct btrfs_subpage *subpage = folio_get_private(folio);
-	struct btrfs_subpage_info *spi = fs_info->subpage_info;
-	u64 orig_start = *start;
-	/* Declare as unsigned long so we can use bitmap ops */
-	unsigned long flags;
-	int range_start_bit;
-	int range_end_bit;
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct extent_map *em;
+	u64 block_start;
+	u64 disk_bytenr;
+	u64 extent_offset;
+	u64 em_end;
+	u32 sectorsize = fs_info->sectorsize;
 
-	/*
-	 * For regular sector size == page size case, since one page only
-	 * contains one sector, we return the page offset directly.
-	 */
-	if (!btrfs_is_subpage(fs_info, folio->mapping)) {
-		*start = folio_pos(folio);
-		*end = folio_pos(folio) + folio_size(folio);
-		return;
+	ASSERT(IS_ALIGNED(filepos, sectorsize));
+
+	/* @filepos >= i_size case should be handled by the caller. */
+	ASSERT(filepos < i_size);
+
+	em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
+	if (IS_ERR(em))
+		return PTR_ERR_OR_ZERO(em);
+
+	extent_offset = filepos - em->start;
+	em_end = extent_map_end(em);
+	ASSERT(filepos <= em_end);
+	ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
+	ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
+
+	block_start = extent_map_block_start(em);
+	disk_bytenr = extent_map_block_start(em) + extent_offset;
+
+	ASSERT(!extent_map_is_compressed(em));
+	ASSERT(block_start != EXTENT_MAP_HOLE);
+	ASSERT(block_start != EXTENT_MAP_INLINE);
+
+	free_extent_map(em);
+	em = NULL;
+
+	btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1);
+	if (!folio_test_writeback(folio)) {
+		btrfs_err(fs_info,
+			  "folio %lu not writeback, cur %llu end %llu",
+			  folio->index, filepos, filepos + sectorsize);
 	}
-
-	range_start_bit = spi->dirty_offset +
-			  (offset_in_folio(folio, orig_start) >>
-			   fs_info->sectorsize_bits);
-
-	/* We should have the page locked, but just in case */
-	spin_lock_irqsave(&subpage->lock, flags);
-	bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
-			       spi->dirty_offset + spi->bitmap_nr_bits);
-	spin_unlock_irqrestore(&subpage->lock, flags);
-
-	range_start_bit -= spi->dirty_offset;
-	range_end_bit -= spi->dirty_offset;
-
-	*start = folio_pos(folio) + range_start_bit * fs_info->sectorsize;
-	*end = folio_pos(folio) + range_end_bit * fs_info->sectorsize;
+	/*
+	 * Although the PageDirty bit is cleared before entering this
+	 * function, subpage dirty bit is not cleared.
+	 * So clear subpage dirty bit here so next time we won't submit
+	 * folio for range already written to disk.
+	 */
+	btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
+	submit_extent_folio(bio_ctrl, disk_bytenr, folio,
+			    sectorsize, filepos - folio_pos(folio));
+	return 0;
 }
 
 /*
@@ -1401,10 +1410,10 @@  static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	u64 cur = start;
-	u64 end = start + len - 1;
-	u64 extent_offset;
-	u64 block_start;
-	struct extent_map *em;
+	unsigned long dirty_bitmap = 1;
+	unsigned long range_bitmap = 0;
+	unsigned int bitmap_size = 1;
+	int bit;
 	int ret = 0;
 	int nr = 0;
 
@@ -1419,18 +1428,23 @@  static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 		return 1;
 	}
 
+	if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) {
+		ASSERT(fs_info->subpage_info);
+		btrfs_get_subpage_dirty_bitmap(fs_info, folio, &dirty_bitmap);
+		bitmap_size = fs_info->subpage_info->bitmap_nr_bits;
+	}
+	for (cur = start; cur < start + len; cur += fs_info->sectorsize)
+		set_bit((cur - start) >> fs_info->sectorsize_bits, &range_bitmap);
+	bitmap_and(&dirty_bitmap, &dirty_bitmap, &range_bitmap, bitmap_size);
+
 	bio_ctrl->end_io_func = end_bbio_data_write;
-	while (cur <= end) {
-		u32 len = end - cur + 1;
-		u64 disk_bytenr;
-		u64 em_end;
-		u64 dirty_range_start = cur;
-		u64 dirty_range_end;
-		u32 iosize;
+
+	for_each_set_bit(bit, &dirty_bitmap, bitmap_size) {
+		cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits);
 
 		if (cur >= i_size) {
-			btrfs_mark_ordered_io_finished(inode, folio, cur, len,
-						       true);
+			btrfs_mark_ordered_io_finished(inode, folio, cur,
+						       start + len - cur, true);
 			/*
 			 * This range is beyond i_size, thus we don't need to
 			 * bother writing back.
@@ -1439,63 +1453,13 @@  static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 			 * writeback the sectors with subpage dirty bits,
 			 * causing writeback without ordered extent.
 			 */
-			btrfs_folio_clear_dirty(fs_info, folio, cur, len);
+			btrfs_folio_clear_dirty(fs_info, folio, cur,
+						start + len - cur);
 			break;
 		}
-
-		find_next_dirty_byte(fs_info, folio, &dirty_range_start,
-				     &dirty_range_end);
-		if (cur < dirty_range_start) {
-			cur = dirty_range_start;
-			continue;
-		}
-
-		em = btrfs_get_extent(inode, NULL, cur, len);
-		if (IS_ERR(em)) {
-			ret = PTR_ERR_OR_ZERO(em);
+		ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size);
+		if (ret < 0)
 			goto out_error;
-		}
-
-		extent_offset = cur - em->start;
-		em_end = extent_map_end(em);
-		ASSERT(cur <= em_end);
-		ASSERT(cur < end);
-		ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
-		ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
-
-		block_start = extent_map_block_start(em);
-		disk_bytenr = extent_map_block_start(em) + extent_offset;
-
-		ASSERT(!extent_map_is_compressed(em));
-		ASSERT(block_start != EXTENT_MAP_HOLE);
-		ASSERT(block_start != EXTENT_MAP_INLINE);
-
-		/*
-		 * Note that em_end from extent_map_end() and dirty_range_end from
-		 * find_next_dirty_byte() are all exclusive
-		 */
-		iosize = min(min(em_end, end + 1), dirty_range_end) - cur;
-		free_extent_map(em);
-		em = NULL;
-
-		btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
-		if (!folio_test_writeback(folio)) {
-			btrfs_err(inode->root->fs_info,
-				   "folio %lu not writeback, cur %llu end %llu",
-			       folio->index, cur, end);
-		}
-
-		/*
-		 * Although the PageDirty bit is cleared before entering this
-		 * function, subpage dirty bit is not cleared.
-		 * So clear subpage dirty bit here so next time we won't submit
-		 * folio for range already written to disk.
-		 */
-		btrfs_folio_clear_dirty(fs_info, folio, cur, iosize);
-
-		submit_extent_folio(bio_ctrl, disk_bytenr, folio,
-				    iosize, cur - folio_pos(folio));
-		cur += iosize;
 		nr++;
 	}
 
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 631d96f1e905..eea3b2c6bbc4 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -940,3 +940,20 @@  void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
 		    subpage_info->bitmap_nr_bits, &ordered_bitmap,
 		    subpage_info->bitmap_nr_bits, &checked_bitmap);
 }
+
+void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
+				    struct folio *folio,
+				    unsigned long *ret_bitmap)
+{
+	struct btrfs_subpage_info *subpage_info = fs_info->subpage_info;
+	struct btrfs_subpage *subpage;
+	unsigned long flags;
+
+	ASSERT(folio_test_private(folio) && folio_get_private(folio));
+	ASSERT(subpage_info);
+	subpage = folio_get_private(folio);
+
+	spin_lock_irqsave(&subpage->lock, flags);
+	GET_SUBPAGE_BITMAP(subpage, subpage_info, dirty, ret_bitmap);
+	spin_unlock_irqrestore(&subpage->lock, flags);
+}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 5532cc4fac50..eee55e5a3952 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -175,6 +175,9 @@  void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
 				  struct folio *folio, u64 start, u32 len);
 void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info,
 			       struct folio *folio, u64 start, u32 len);
+void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
+				    struct folio *folio,
+				    unsigned long *ret_bitmap);
 void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
 				      struct folio *folio, u64 start, u32 len);