
[v2,1/3] btrfs: introduce a read path dedicated extent lock helper

Message ID 1b6921a745547f352f5b7e266ef28e864f2ce056.1739328504.git.wqu@suse.com (mailing list archive)
State New
Series btrfs: enhancement to pass generic/563

Commit Message

Qu Wenruo Feb. 12, 2025, 2:52 a.m. UTC
Currently we're using btrfs_lock_and_flush_ordered_range() for both
btrfs_read_folio() and btrfs_readahead(), but it has one critical
problem for future subpage enhancements:

- It will call btrfs_start_ordered_extent() to write back the involved
  folios.

  But remember that we're calling btrfs_lock_and_flush_ordered_range()
  in the read paths, meaning the folio is already locked by the read
  path.

  If we really trigger writeback there, it will deadlock, since the
  writeback cannot acquire the folio lock.

  For now such a deadlock is prevented by the fact that btrfs always
  keeps a dirty folio uptodate as well.

  But that is not the common behavior: XFS and ext4 both allow a folio
  to be dirty without the full folio being uptodate.

  They allow block-aligned buffered writes to mark only the involved
  blocks of a folio dirty, without reading the full folio first.

Instead of blindly calling btrfs_start_ordered_extent(), introduce a
new helper which is smarter in the following ways (condensed into a
code sketch after the list):

- Only wait for and flush the ordered extent if:
  * The folio doesn't even have the private flag set
  * Part of the blocks covered by the ordered extent are not uptodate

  This can happen when:
  * Folio writeback finished and the folio then got invalidated, but
    the OE has not yet finished
  * Direct IO

  We have to wait for the ordered extent, as it may contain a
  to-be-inserted data checksum.
  Without waiting, our read would fail with a missing csum.

  But either way, the OE should not need any extra flush inside the
  locked folio range.

- Skip the ordered extent completely if:
  * All the blocks are dirty
    This happens when the OE was created by a previous folio's writeback.
    The writeback of our blocks will never happen (we're holding the
    folio lock for read), nor will the OE finish.

    Thus we must skip the range.

  * All the blocks are uptodate
    This happens when the writeback finished, but the OE has not yet
    finished.

    Since the blocks are already uptodate, we can skip the OE range.
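
Condensed, the per-block decision described above looks roughly like the
following (a simplified sketch only, ignoring the exact boundary math;
the real logic is in can_skip_one_ordered_range() in the patch below):

  /* Sketch: can we skip waiting for @ordered at block @cur? */
  if (!folio_test_private(folio))
          return false;   /* Case 0): invalidated or direct IO, must wait. */
  if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) {
          /* Case 1): OE created by a previous folio, skip the whole OE. */
          *next_ret = ordered->file_offset + ordered->num_bytes;
          return true;
  }
  if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
          /* Case 2): writeback done, skip to the OE/folio boundary. */
          *next_ret = cur + range_len;   /* range_len = min(folio end, OE end) - cur */
          return true;
  }
  return false;           /* Case 3): not uptodate, must wait. */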

The new helper, lock_extents_for_read(), loops over the target range
as follows (sketched in code right after this list):

1) Lock the full range

2) If there is no ordered extent in the remaining range, exit

3) If there is an ordered extent that we can skip
   Skip to the end of the OE and continue checking.
   We do not trigger writeback nor wait for the OE.

4) If there is an ordered extent that we cannot skip
   Unlock the whole extent range and start the ordered extent.
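
In rough pseudo-code the loop is (a simplified sketch of
lock_extents_for_read() as implemented in the patch below):

  again:
          lock_extent(&inode->io_tree, start, end, cached_state);
          cur = start;
          while (cur < end) {
                  ordered = btrfs_lookup_ordered_range(inode, cur, end - cur + 1);
                  if (!ordered)
                          break;                          /* step 2) */
                  if (can_skip_ordered_extent(inode, ordered, start, end)) {
                          /* step 3): jump past the OE, no flush, no wait. */
                          cur = min(ordered->file_offset + ordered->num_bytes, end + 1);
                          btrfs_put_ordered_extent(ordered);
                          continue;
                  }
                  /* step 4): unlock everything and wait for this OE. */
                  unlock_extent(&inode->io_tree, start, end, cached_state);
                  btrfs_start_ordered_extent(ordered, start, end + 1 - start);
                  btrfs_put_ordered_extent(ordered);
                  goto again;
          }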

Also update btrfs_start_ordered_extent() to take two extra parameters,
@nowriteback_start and @nowriteback_len, to prevent triggering a flush
for a given range.
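
Existing callers simply pass (0, 0) to keep the old behavior. With a
non-zero @nowriteback_len, the flush inside btrfs_start_ordered_extent()
is split around that window, roughly (mirroring the ordered-data.c hunk
below, where mapping is the inode's i_mapping):

  if (!nowriteback_len) {
          filemap_fdatawrite_range(mapping, start, end);
  } else {
          /* Only flush the parts of the OE outside the no-writeback window. */
          if (start < nowriteback_start)
                  filemap_fdatawrite_range(mapping, start, nowriteback_start - 1);
          if (nowriteback_start + nowriteback_len < end)
                  filemap_fdatawrite_range(mapping,
                                           nowriteback_start + nowriteback_len, end);
  }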

This will allow us to handle the following case properly in the future:

 16K page size, 4K btrfs block size:

 16K           20K             24K              28K            32K
 |/////////////////////////////|                |              |
 |<----------- OE 2----------->|                |<--- OE 1 --->|

 The folio has been written back before, thus we have an OE at
 [28K, 32K).
 Although OE 1 has finished its IO, it has not yet been removed from
 the IO tree.
 Later the folio got invalidated, but the OE still exists.

 And the [16K, 24K) range is dirty and uptodate, caused by a
 block-aligned buffered write (and future enhancements allowing btrfs
 to skip the full folio read for such a case).

 Furthermore, OE 2 has been created, covering the range [16K, 24K), by
 the writeback of the previous folio.

 Since the full folio is not uptodate, if we want to read the folio,
 the existing btrfs_lock_and_flush_ordered_range() will deadlock:

 btrfs_read_folio()
 | Folio 16K is already locked
 |- btrfs_lock_and_flush_ordered_range()
    |- btrfs_start_ordered_extent() for range [16K, 24K)
       |- filemap_fdatawrite_range() for range [16K, 24K)
          |- extent_write_cache_pages()
             folio_lock() on folio 16K, deadlock.

 But now we will have the following sequence:

 btrfs_read_folio()
 | Folio 16K is already locked
 |- lock_extents_for_read()
    |- can_skip_ordered_extent() for range [16K, 24K)
    |  Returned true, the range [16K, 24K) will be skipped.
    |- can_skip_ordered_extent() for range [28K, 32K)
    |  Returned false.
    |- btrfs_start_ordered_extent() for range [28K, 32K) with
       [16K, 32K) as no writeback range
       No writeback for folio 16K will be triggered.

 And there is no longer any possible deadlock on the same folio.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/defrag.c       |   2 +-
 fs/btrfs/direct-io.c    |   2 +-
 fs/btrfs/extent_io.c    | 183 +++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/file.c         |   4 +-
 fs/btrfs/inode.c        |   4 +-
 fs/btrfs/ordered-data.c |  29 +++++--
 fs/btrfs/ordered-data.h |   3 +-
 7 files changed, 210 insertions(+), 17 deletions(-)

Patch

diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 968dae953948..d1330c138054 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -902,7 +902,7 @@  static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t
 			break;
 
 		folio_unlock(folio);
-		btrfs_start_ordered_extent(ordered);
+		btrfs_start_ordered_extent(ordered, 0, 0);
 		btrfs_put_ordered_extent(ordered);
 		folio_lock(folio);
 		/*
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index b5190a010205..c98db5058967 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -103,7 +103,7 @@  static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
 			 */
 			if (writing ||
 			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
-				btrfs_start_ordered_extent(ordered);
+				btrfs_start_ordered_extent(ordered, 0, 0);
 			else
 				ret = nowait ? -EAGAIN : -ENOTBLK;
 			btrfs_put_ordered_extent(ordered);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 91b20dccef73..819d51c3ed57 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1075,6 +1075,185 @@  static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
 	return 0;
 }
 
+/*
+ * Check if we can skip waiting for the @ordered extent covering the block
+ * at file offset @cur.
+ *
+ * Return true if we can skip to @next_ret. The caller needs to check
+ * the @next_ret value to make sure it covers the full range before
+ * skipping the OE.
+ *
+ * Return false if we must wait for the ordered extent.
+ *
+ * @cur:	The start file offset; the folio covering it is already locked.
+ * @next_ret:	If we return true, this indicates the start of the next
+ *		range to check.
+ */
+static bool can_skip_one_ordered_range(struct btrfs_inode *binode,
+				       struct btrfs_ordered_extent *ordered,
+				       u64 cur, u64 *next_ret)
+{
+	const struct btrfs_fs_info *fs_info = binode->root->fs_info;
+	struct folio *folio;
+	const u32 blocksize = fs_info->sectorsize;
+	u64 range_len;
+	bool ret;
+
+	folio = filemap_get_folio(binode->vfs_inode.i_mapping,
+				  cur >> PAGE_SHIFT);
+
+	/*
+	 * We should have locked the folio(s) for range [start, end], thus
+	 * there must be a folio and it must be locked.
+	 */
+	ASSERT(!IS_ERR(folio));
+	ASSERT(folio_test_locked(folio));
+
+	/*
+	 * We have several cases for the folio and OE combination:
+	 *
+	 * 0) Folio has no private flag
+	 *    The OE has all its IO done but is not yet finished, and the
+	 *    folio got invalidated. Or it's direct IO.
+	 *
+	 *    We have to wait for the OE to finish, as it may contain the
+	 *    to-be-inserted data checksum.
+	 *    Without the data checksum inserted into the csum tree, the
+	 *    read will just fail with a missing csum.
+	 */
+	if (!folio_test_private(folio)) {
+		ret = false;
+		goto out;
+	}
+	range_len = min(folio_pos(folio) + folio_size(folio),
+			ordered->file_offset + ordered->num_bytes) - cur;
+
+	/*
+	 * 1) The first block is DIRTY.
+	 *
+	 *    This means the OE was created by some folio before us, but its
+	 *    writeback has not started yet.
+	 *    We can and must skip the whole OE, because it will never start
+	 *    until we have finished our folio read and unlocked the folio.
+	 */
+	if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) {
+		ret = true;
+		/*
+		 * At least inside the folio, all the remaining blocks should
+		 * also be dirty.
+		 */
+		ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len));
+		*next_ret = ordered->file_offset + ordered->num_bytes;
+		goto out;
+	}
+
+	/*
+	 * 2) The first block is uptodate.
+	 *
+	 *    At least the first block can be skipped, but we are still not
+	 *    fully sure, e.g. the OE may cover other folios in the range
+	 *    that cannot be skipped.
+	 *    So we return true and update @next_ret to the OE/folio boundary.
+	 */
+	if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
+		u64 range_len = min(folio_pos(folio) + folio_size(folio),
+				    ordered->file_offset + ordered->num_bytes) - cur;
+
+		/*
+		 * The whole range to the OE end or folio boundary should also
+		 * be uptodate.
+		 */
+		ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len));
+		ret = true;
+		*next_ret = cur + range_len;
+		goto out;
+	}
+
+	/*
+	 * 3) The first block is not uptodate.
+	 *
+	 *    The folio got invalidated after its writeback finished, or it is
+	 *    direct IO. Very much the same as case 0), just with private set.
+	 */
+	ret = false;
+out:
+	folio_put(folio);
+	return ret;
+}
+
+static bool can_skip_ordered_extent(struct btrfs_inode *binode,
+				    struct btrfs_ordered_extent *ordered,
+				    u64 start, u64 end)
+{
+	u64 range_end = min(end, ordered->file_offset + ordered->num_bytes - 1);
+	u64 range_start = max(start, ordered->file_offset);
+	u64 cur = range_start;
+
+	while (cur < range_end) {
+		bool can_skip;
+		u64 next_start;
+
+		can_skip = can_skip_one_ordered_range(binode, ordered, cur,
+						      &next_start);
+		if (!can_skip)
+			return false;
+		cur = next_start;
+	}
+	return true;
+}
+
+/*
+ * Lock the extent range to make sure we get a stable view of the extent
+ * maps for the involved range. This is for the folio read paths (read and
+ * readahead), thus the involved range should have all its folios locked.
+ */
+static void lock_extents_for_read(struct btrfs_inode *binode, u64 start, u64 end,
+				  struct extent_state **cached_state)
+{
+	struct btrfs_ordered_extent *ordered;
+	u64 cur_pos;
+
+	/* Caller must provide a valid @cached_state. */
+	ASSERT(cached_state);
+
+	/*
+	 * The range must at least be page aligned, as all read paths
+	 * are folio based.
+	 */
+	ASSERT(IS_ALIGNED(start, PAGE_SIZE) && IS_ALIGNED(end + 1, PAGE_SIZE));
+
+again:
+	lock_extent(&binode->io_tree, start, end, cached_state);
+	cur_pos = start;
+	while (cur_pos < end) {
+		ordered = btrfs_lookup_ordered_range(binode, cur_pos,
+						     end - cur_pos + 1);
+		/*
+		 * No ordered extent in the remaining range, and we hold the
+		 * extent lock, so no one can modify the extent maps in the
+		 * range. We're safe to return.
+		 */
+		if (!ordered)
+			break;
+
+		/* Check if we can skip waiting for the whole OE. */
+		if (can_skip_ordered_extent(binode, ordered, start, end)) {
+			cur_pos = min(ordered->file_offset + ordered->num_bytes,
+				      end + 1);
+			btrfs_put_ordered_extent(ordered);
+			continue;
+		}
+
+		/* Now wait for the OE to finish. */
+		unlock_extent(&binode->io_tree, start, end,
+			      cached_state);
+		btrfs_start_ordered_extent(ordered, start, end + 1 - start);
+		btrfs_put_ordered_extent(ordered);
+		/* We have unlocked the whole range, restart from the beginning. */
+		goto again;
+	}
+}
+
 int btrfs_read_folio(struct file *file, struct folio *folio)
 {
 	struct btrfs_inode *inode = folio_to_inode(folio);
@@ -1085,7 +1264,7 @@  int btrfs_read_folio(struct file *file, struct folio *folio)
 	struct extent_map *em_cached = NULL;
 	int ret;
 
-	btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state);
+	lock_extents_for_read(inode, start, end, &cached_state);
 	ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL);
 	unlock_extent(&inode->io_tree, start, end, &cached_state);
 
@@ -2375,7 +2554,7 @@  void btrfs_readahead(struct readahead_control *rac)
 	struct extent_map *em_cached = NULL;
 	u64 prev_em_start = (u64)-1;
 
-	btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state);
+	lock_extents_for_read(inode, start, end, &cached_state);
 
 	while ((folio = readahead_folio(rac)) != NULL)
 		btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 579706fab9b4..81e6cb599585 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -942,7 +942,7 @@  lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
 				      cached_state);
 			folio_unlock(folio);
 			folio_put(folio);
-			btrfs_start_ordered_extent(ordered);
+			btrfs_start_ordered_extent(ordered, 0, 0);
 			btrfs_put_ordered_extent(ordered);
 			return -EAGAIN;
 		}
@@ -1846,7 +1846,7 @@  static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
 		unlock_extent(io_tree, page_start, page_end, &cached_state);
 		folio_unlock(folio);
 		up_read(&BTRFS_I(inode)->i_mmap_lock);
-		btrfs_start_ordered_extent(ordered);
+		btrfs_start_ordered_extent(ordered, 0, 0);
 		btrfs_put_ordered_extent(ordered);
 		goto again;
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 32895aabf0ff..eaf53408254d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2821,7 +2821,7 @@  static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
 		unlock_extent(&inode->io_tree, page_start, page_end,
 			      &cached_state);
 		folio_unlock(folio);
-		btrfs_start_ordered_extent(ordered);
+		btrfs_start_ordered_extent(ordered, 0, 0);
 		btrfs_put_ordered_extent(ordered);
 		goto again;
 	}
@@ -4876,7 +4876,7 @@  int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
 		unlock_extent(io_tree, block_start, block_end, &cached_state);
 		folio_unlock(folio);
 		folio_put(folio);
-		btrfs_start_ordered_extent(ordered);
+		btrfs_start_ordered_extent(ordered, 0, 0);
 		btrfs_put_ordered_extent(ordered);
 		goto again;
 	}
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 4aca7475fd82..6075a6fa4817 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -729,7 +729,7 @@  static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
 	struct btrfs_ordered_extent *ordered;
 
 	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
-	btrfs_start_ordered_extent(ordered);
+	btrfs_start_ordered_extent(ordered, 0, 0);
 	complete(&ordered->completion);
 }
 
@@ -842,10 +842,12 @@  void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
 /*
  * Start IO and wait for a given ordered extent to finish.
  *
- * Wait on page writeback for all the pages in the extent and the IO completion
- * code to insert metadata into the btree corresponding to the extent.
+ * Wait on page writeback for all the pages in the extent, except those in
+ * [@nowriteback_start, @nowriteback_start + @nowriteback_len), and wait for
+ * the IO completion code to insert metadata into the btree corresponding to the extent.
  */
-void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry,
+				u64 nowriteback_start, u32 nowriteback_len)
 {
 	u64 start = entry->file_offset;
 	u64 end = start + entry->num_bytes - 1;
@@ -865,8 +867,19 @@  void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
 	 * start IO on any dirty ones so the wait doesn't stall waiting
 	 * for the flusher thread to find them
 	 */
-	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
-		filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
+	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) {
+		if (!nowriteback_len) {
+			filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
+		} else {
+			if (start < nowriteback_start)
+				filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start,
+						nowriteback_start - 1);
+			if (nowriteback_start + nowriteback_len < end)
+				filemap_fdatawrite_range(inode->vfs_inode.i_mapping,
+						nowriteback_start + nowriteback_len,
+						end);
+		}
+	}
 
 	if (!freespace_inode)
 		btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
@@ -921,7 +934,7 @@  int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len)
 			btrfs_put_ordered_extent(ordered);
 			break;
 		}
-		btrfs_start_ordered_extent(ordered);
+		btrfs_start_ordered_extent(ordered, 0, 0);
 		end = ordered->file_offset;
 		/*
 		 * If the ordered extent had an error save the error but don't
@@ -1174,7 +1187,7 @@  void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
 			break;
 		}
 		unlock_extent(&inode->io_tree, start, end, cachedp);
-		btrfs_start_ordered_extent(ordered);
+		btrfs_start_ordered_extent(ordered, 0, 0);
 		btrfs_put_ordered_extent(ordered);
 	}
 }
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 4e152736d06c..d7cf69647434 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -191,7 +191,8 @@  void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
 			   struct btrfs_ordered_sum *sum);
 struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
 							 u64 file_offset);
-void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry);
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry,
+				u64 nowriteback_start, u32 nowriteback_len);
 int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len);
 struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);