Message ID | f50af2f3cbcfc390265a09ac1962a8afb1b5c22d.1724847323.git.rgoldwyn@suse.com (mailing list archive)
---|---
State | New, archived
Series | Limit scope of extent locks in btrfs_buffered_write()
On Wed, Aug 28, 2024 at 1:54 PM Goldwyn Rodrigues <rgoldwyn@suse.de> wrote:
>
> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>
> Reduce the scope of locking while performing writes. The scope is
> limited to changing extent bits, which is in btrfs_dirty_pages().
> btrfs_dirty_pages() is also called from __btrfs_write_out_cache(), and
> it expects extent locks held. So, perform extent locking around
> btrfs_dirty_pages() only.
>
> The inode lock will insure that no other process is writing to this
> file, so we don't need to worry about multiple processes dirtying
> folios.
>
> However, the write has to make sure that there are no ordered extents in
> the range specified. So, call btrfs_wait_ordered_range() before
> initiating the write. In case of nowait, bail if there exists an
> ordered range within the write range.

if there exists an ordered range -> if there exists an ordered extent

>
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> ---
>  fs/btrfs/file.c | 109 +++++++-----------------------------------------
>  1 file changed, 16 insertions(+), 93 deletions(-)
>
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 76f4cc686af9..6c5f712bfa0f 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -976,83 +976,6 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
>
>  }
>
> -/*
> - * This function locks the extent and properly waits for data=ordered extents
> - * to finish before allowing the pages to be modified if need.
> - *
> - * The return value:
> - * 1 - the extent is locked
> - * 0 - the extent is not locked, and everything is OK
> - * -EAGAIN - need re-prepare the pages
> - * the other < 0 number - Something wrong happens
> - */
> -static noinline int
> -lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
> -				size_t num_pages, loff_t pos,
> -				size_t write_bytes,
> -				u64 *lockstart, u64 *lockend, bool nowait,
> -				struct extent_state **cached_state)
> -{
> -	struct btrfs_fs_info *fs_info = inode->root->fs_info;
> -	u64 start_pos;
> -	u64 last_pos;
> -	int i;
> -	int ret = 0;
> -
> -	start_pos = round_down(pos, fs_info->sectorsize);
> -	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
> -
> -	if (start_pos < inode->vfs_inode.i_size) {
> -		struct btrfs_ordered_extent *ordered;
> -
> -		if (nowait) {
> -			if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
> -					     cached_state)) {
> -				for (i = 0; i < num_pages; i++) {
> -					unlock_page(pages[i]);
> -					put_page(pages[i]);
> -					pages[i] = NULL;
> -				}
> -
> -				return -EAGAIN;
> -			}
> -		} else {
> -			lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
> -		}
> -
> -		ordered = btrfs_lookup_ordered_range(inode, start_pos,
> -						     last_pos - start_pos + 1);
> -		if (ordered &&
> -		    ordered->file_offset + ordered->num_bytes > start_pos &&
> -		    ordered->file_offset <= last_pos) {
> -			unlock_extent(&inode->io_tree, start_pos, last_pos,
> -				      cached_state);
> -			for (i = 0; i < num_pages; i++) {
> -				unlock_page(pages[i]);
> -				put_page(pages[i]);
> -			}
> -			btrfs_start_ordered_extent(ordered);
> -			btrfs_put_ordered_extent(ordered);
> -			return -EAGAIN;
> -		}
> -		if (ordered)
> -			btrfs_put_ordered_extent(ordered);
> -
> -		*lockstart = start_pos;
> -		*lockend = last_pos;
> -		ret = 1;
> -	}
> -
> -	/*
> -	 * We should be called after prepare_pages() which should have locked
> -	 * all pages in the range.
> -	 */
> -	for (i = 0; i < num_pages; i++)
> -		WARN_ON(!PageLocked(pages[i]));
> -
> -	return ret;
> -}
> -
>  /*
>   * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
>   *
> @@ -1246,7 +1169,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
>  		size_t copied;
>  		size_t dirty_sectors;
>  		size_t num_sectors;
> -		int extents_locked;
> +		int extents_locked = false;

This is an int. So either assign to 0 or change the type to bool
(preferable).

>
>  		/*
>  		 * Fault pages before locking them in prepare_pages
> @@ -1310,13 +1233,19 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
>  		}
>
>  		release_bytes = reserve_bytes;
> -again:
>  		ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
>  		if (ret) {
>  			btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
>  			break;
>  		}
>
> +		if (nowait && !btrfs_has_ordered_extent(BTRFS_I(inode), pos, write_bytes)) {

The logic is not correct, the ! should be removed. I.e. if it's a nowait
write and there are ordered extents, we want to stop because it would
make us block waiting for ordered extents.

> +			btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
> +			break;

Before the break we also need a:

ret = -EAGAIN

We should also prevent looking up ordered extents if the write starts at
or beyond i_size, just like before this patch.

> +		} else {
> +			btrfs_wait_ordered_range(BTRFS_I(inode), pos, write_bytes);
> +		}
> +
>  		/*
>  		 * This is going to setup the pages array with the number of
>  		 * pages we want, so we don't really need to worry about the
> @@ -1330,20 +1259,6 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
>  			break;
>  		}
>
> -		extents_locked = lock_and_cleanup_extent_if_need(
> -				BTRFS_I(inode), pages,
> -				num_pages, pos, write_bytes, &lockstart,
> -				&lockend, nowait, &cached_state);
> -		if (extents_locked < 0) {
> -			if (!nowait && extents_locked == -EAGAIN)
> -				goto again;
> -
> -			btrfs_delalloc_release_extents(BTRFS_I(inode),
> -						       reserve_bytes);
> -			ret = extents_locked;
> -			break;
> -		}
> -
>  		copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
>
>  		num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
> @@ -1389,6 +1304,14 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
>  		release_bytes = round_up(copied + sector_offset,
>  					 fs_info->sectorsize);
>
> +		if (pos < inode->i_size) {
> +			lockstart = round_down(pos, fs_info->sectorsize);
> +			lockend = round_up(pos + copied, fs_info->sectorsize);
> +			lock_extent(&BTRFS_I(inode)->io_tree, lockstart,
> +				    lockend, &cached_state);

We should respect the nowait semantics and do a try_lock_extent() if
we're a nowait write.

> +			extents_locked = true;

Same here, the type is int and not bool.

Thanks.

> +		}
> +
>  		ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
>  					dirty_pages, pos, copied,
>  					&cached_state, only_release_metadata);
> --
> 2.46.0
>
>
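Taken together, those review comments suggest a shape like the following for the new ordered-extent check. This is only a sketch, not the actual follow-up patch: it assumes the series' btrfs_has_ordered_extent() helper, drops the `!`, sets ret to -EAGAIN before the break, and keeps the pre-patch behavior of skipping the lookup for writes at or beyond i_size.

```c
/*
 * Sketch of the nowait/ordered-extent check with the review feedback
 * applied (hypothetical; btrfs_has_ordered_extent() comes from this
 * patch series, not from mainline).
 */
if (pos < i_size_read(inode)) {
	if (nowait) {
		/* Bail out if an ordered extent exists: note the dropped '!'. */
		if (btrfs_has_ordered_extent(BTRFS_I(inode), pos, write_bytes)) {
			btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
			ret = -EAGAIN;
			break;
		}
	} else {
		/* Blocking writers can simply wait for the range to settle. */
		btrfs_wait_ordered_range(BTRFS_I(inode), pos, write_bytes);
	}
}
```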
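Likewise, the extent locking before btrfs_dirty_pages() would need a non-blocking variant for NOWAIT writers. A hypothetical shape, with extents_locked changed to bool at the top of the loop as the review asks:

```c
/*
 * Sketch: honor NOWAIT semantics when taking the extent lock
 * (extents_locked declared as bool, initialized to false, per review).
 */
if (pos < inode->i_size) {
	lockstart = round_down(pos, fs_info->sectorsize);
	lockend = round_up(pos + copied, fs_info->sectorsize);
	if (nowait) {
		if (!try_lock_extent(&BTRFS_I(inode)->io_tree, lockstart,
				     lockend, &cached_state)) {
			/* A real patch must also release pages and reservations here. */
			ret = -EAGAIN;
			break;
		}
	} else {
		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			    &cached_state);
	}
	extents_locked = true;
}
```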
On Wed, Aug 28, 2024 at 1:54 PM Goldwyn Rodrigues <rgoldwyn@suse.de> wrote:
>
> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>
> Reduce the scope of locking while performing writes. The scope is
> limited to changing extent bits, which is in btrfs_dirty_pages().
> btrfs_dirty_pages() is also called from __btrfs_write_out_cache(), and
> it expects extent locks held. So, perform extent locking around
> btrfs_dirty_pages() only.

There's one fundamental problem I missed before. Comments inlined below.

>
> The inode lock will insure that no other process is writing to this
> file, so we don't need to worry about multiple processes dirtying
> folios.

Well, there's an exception: memory mapped writes, as they don't take the
inode lock. So this patch introduces a race, see below.

>
> However, the write has to make sure that there are no ordered extents in
> the range specified. So, call btrfs_wait_ordered_range() before
> initiating the write. In case of nowait, bail if there exists an
> ordered range within the write range.
>
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> ---
>  fs/btrfs/file.c | 109 +++++++-----------------------------------------
>  1 file changed, 16 insertions(+), 93 deletions(-)
>
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 76f4cc686af9..6c5f712bfa0f 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -976,83 +976,6 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
>
>  }
>
> -/*
> - * This function locks the extent and properly waits for data=ordered extents
> - * to finish before allowing the pages to be modified if need.
> - *
> - * The return value:
> - * 1 - the extent is locked
> - * 0 - the extent is not locked, and everything is OK
> - * -EAGAIN - need re-prepare the pages
> - * the other < 0 number - Something wrong happens
> - */
> -static noinline int
> -lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
> -				size_t num_pages, loff_t pos,
> -				size_t write_bytes,
> -				u64 *lockstart, u64 *lockend, bool nowait,
> -				struct extent_state **cached_state)
> -{
> -	struct btrfs_fs_info *fs_info = inode->root->fs_info;
> -	u64 start_pos;
> -	u64 last_pos;
> -	int i;
> -	int ret = 0;
> -
> -	start_pos = round_down(pos, fs_info->sectorsize);
> -	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
> -
> -	if (start_pos < inode->vfs_inode.i_size) {
> -		struct btrfs_ordered_extent *ordered;
> -
> -		if (nowait) {
> -			if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
> -					     cached_state)) {
> -				for (i = 0; i < num_pages; i++) {
> -					unlock_page(pages[i]);
> -					put_page(pages[i]);
> -					pages[i] = NULL;
> -				}
> -
> -				return -EAGAIN;
> -			}
> -		} else {
> -			lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
> -		}
> -
> -		ordered = btrfs_lookup_ordered_range(inode, start_pos,
> -						     last_pos - start_pos + 1);
> -		if (ordered &&
> -		    ordered->file_offset + ordered->num_bytes > start_pos &&
> -		    ordered->file_offset <= last_pos) {
> -			unlock_extent(&inode->io_tree, start_pos, last_pos,
> -				      cached_state);
> -			for (i = 0; i < num_pages; i++) {
> -				unlock_page(pages[i]);
> -				put_page(pages[i]);
> -			}
> -			btrfs_start_ordered_extent(ordered);
> -			btrfs_put_ordered_extent(ordered);
> -			return -EAGAIN;
> -		}
> -		if (ordered)
> -			btrfs_put_ordered_extent(ordered);
> -
> -		*lockstart = start_pos;
> -		*lockend = last_pos;
> -		ret = 1;
> -	}
> -
> -	/*
> -	 * We should be called after prepare_pages() which should have locked
> -	 * all pages in the range.
> -	 */
> -	for (i = 0; i < num_pages; i++)
> -		WARN_ON(!PageLocked(pages[i]));
> -
> -	return ret;
> -}
> -
>  /*
>   * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
>   *
> @@ -1246,7 +1169,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
>  		size_t copied;
>  		size_t dirty_sectors;
>  		size_t num_sectors;
> -		int extents_locked;
> +		int extents_locked = false;
>
>  		/*
>  		 * Fault pages before locking them in prepare_pages
> @@ -1310,13 +1233,19 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
>  		}
>
>  		release_bytes = reserve_bytes;
> -again:
>  		ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
>  		if (ret) {
>  			btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
>  			break;
>  		}
>
> +		if (nowait && !btrfs_has_ordered_extent(BTRFS_I(inode), pos, write_bytes)) {
> +			btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
> +			break;
> +		} else {
> +			btrfs_wait_ordered_range(BTRFS_I(inode), pos, write_bytes);

So after waiting for ordered extents, we call prepare_pages() below,
which is what gets and locks the pages. But after this wait and before
we lock the pages, an mmap write can happen and dirty pages; if, after
it completes and before we lock the pages at prepare_pages(), delalloc
is flushed, an ordered extent for the range is created - and we will
miss it here and not wait for it to complete.

Before this patch that couldn't happen, as the ordered extent lookup was
done with the extent range locked, after the pages were already locked,
so any such ordered extent was found and waited on.

Thanks.

> +		}
> +
>  		/*
>  		 * This is going to setup the pages array with the number of
>  		 * pages we want, so we don't really need to worry about the
> @@ -1330,20 +1259,6 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
>  			break;
>  		}
>
> -		extents_locked = lock_and_cleanup_extent_if_need(
> -				BTRFS_I(inode), pages,
> -				num_pages, pos, write_bytes, &lockstart,
> -				&lockend, nowait, &cached_state);
> -		if (extents_locked < 0) {
> -			if (!nowait && extents_locked == -EAGAIN)
> -				goto again;
> -
> -			btrfs_delalloc_release_extents(BTRFS_I(inode),
> -						       reserve_bytes);
> -			ret = extents_locked;
> -			break;
> -		}
> -
>  		copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
>
>  		num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
> @@ -1389,6 +1304,14 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
>  		release_bytes = round_up(copied + sector_offset,
>  					 fs_info->sectorsize);
>
> +		if (pos < inode->i_size) {
> +			lockstart = round_down(pos, fs_info->sectorsize);
> +			lockend = round_up(pos + copied, fs_info->sectorsize);
> +			lock_extent(&BTRFS_I(inode)->io_tree, lockstart,
> +				    lockend, &cached_state);
> +			extents_locked = true;
> +		}
> +
>  		ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
>  					dirty_pages, pos, copied,
>  					&cached_state, only_release_metadata);
> --
> 2.46.0
>
>
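For completeness, one way the mmap race could be closed (purely illustrative, not something proposed in this thread) is to re-check for ordered extents under the extent lock once prepare_pages() holds the page locks, mirroring what the removed lock_and_cleanup_extent_if_need() did:

```c
/*
 * Hypothetical re-check after prepare_pages(): any ordered extent created
 * by a racing mmap write plus a delalloc flush is found here, under the
 * extent lock, and waited on before retrying the iteration.
 */
struct btrfs_ordered_extent *ordered;

lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state);
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
				     lockend - lockstart + 1);
if (ordered) {
	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
		      &cached_state);
	/* Unlock and put the prepared pages here, as the old helper did. */
	btrfs_start_ordered_extent(ordered);
	btrfs_put_ordered_extent(ordered);
	goto again;	/* assumes the removed 'again:' label is restored */
}
```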