[2/2] btrfs: reduce scope of extent locks during buffered write

Message ID f50af2f3cbcfc390265a09ac1962a8afb1b5c22d.1724847323.git.rgoldwyn@suse.com (mailing list archive)
State New
Series: Limit scope of extent locks in btrfs_buffered_write()

Commit Message

Goldwyn Rodrigues Aug. 28, 2024, 12:45 p.m. UTC
From: Goldwyn Rodrigues <rgoldwyn@suse.com>

Reduce the scope of locking while performing writes. The scope is
limited to changing extent bits, which is in btrfs_dirty_pages().
btrfs_dirty_pages() is also called from __btrfs_write_out_cache(), and
it expects extent locks held. So, perform extent locking around
btrfs_dirty_pages() only.

The inode lock will ensure that no other process is writing to this
file, so we don't need to worry about multiple processes dirtying
folios.

However, the write has to make sure that there are no ordered extents in
the range specified. So, call btrfs_wait_ordered_range() before
initiating the write. In case of nowait, bail if there exists an
ordered range within the write range.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 fs/btrfs/file.c | 109 +++++++-----------------------------------------
 1 file changed, 16 insertions(+), 93 deletions(-)

Comments

Filipe Manana Aug. 28, 2024, 1:17 p.m. UTC | #1
On Wed, Aug 28, 2024 at 1:54 PM Goldwyn Rodrigues <rgoldwyn@suse.de> wrote:
>
> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>
> Reduce the scope of locking while performing writes. The scope is
> limited to changing extent bits, which is in btrfs_dirty_pages().
> btrfs_dirty_pages() is also called from __btrfs_write_out_cache(), and
> it expects extent locks held. So, perform extent locking around
> btrfs_dirty_pages() only.
>
> The inode lock will ensure that no other process is writing to this
> file, so we don't need to worry about multiple processes dirtying
> folios.
>
> However, the write has to make sure that there are no ordered extents in
> the range specified. So, call btrfs_wait_ordered_range() before
> initiating the write. In case of nowait, bail if there exists an
> ordered range within the write range.

if there exists an ordered range -> if there exists an ordered extent

>
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> ---
>  fs/btrfs/file.c | 109 +++++++-----------------------------------------
>  1 file changed, 16 insertions(+), 93 deletions(-)
>
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 76f4cc686af9..6c5f712bfa0f 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -976,83 +976,6 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
>
>  }
>
> -/*
> - * This function locks the extent and properly waits for data=ordered extents
> - * to finish before allowing the pages to be modified if need.
> - *
> - * The return value:
> - * 1 - the extent is locked
> - * 0 - the extent is not locked, and everything is OK
> - * -EAGAIN - need re-prepare the pages
> - * the other < 0 number - Something wrong happens
> - */
> -static noinline int
> -lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
> -                               size_t num_pages, loff_t pos,
> -                               size_t write_bytes,
> -                               u64 *lockstart, u64 *lockend, bool nowait,
> -                               struct extent_state **cached_state)
> -{
> -       struct btrfs_fs_info *fs_info = inode->root->fs_info;
> -       u64 start_pos;
> -       u64 last_pos;
> -       int i;
> -       int ret = 0;
> -
> -       start_pos = round_down(pos, fs_info->sectorsize);
> -       last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
> -
> -       if (start_pos < inode->vfs_inode.i_size) {
> -               struct btrfs_ordered_extent *ordered;
> -
> -               if (nowait) {
> -                       if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
> -                                            cached_state)) {
> -                               for (i = 0; i < num_pages; i++) {
> -                                       unlock_page(pages[i]);
> -                                       put_page(pages[i]);
> -                                       pages[i] = NULL;
> -                               }
> -
> -                               return -EAGAIN;
> -                       }
> -               } else {
> -                       lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
> -               }
> -
> -               ordered = btrfs_lookup_ordered_range(inode, start_pos,
> -                                                    last_pos - start_pos + 1);
> -               if (ordered &&
> -                   ordered->file_offset + ordered->num_bytes > start_pos &&
> -                   ordered->file_offset <= last_pos) {
> -                       unlock_extent(&inode->io_tree, start_pos, last_pos,
> -                                     cached_state);
> -                       for (i = 0; i < num_pages; i++) {
> -                               unlock_page(pages[i]);
> -                               put_page(pages[i]);
> -                       }
> -                       btrfs_start_ordered_extent(ordered);
> -                       btrfs_put_ordered_extent(ordered);
> -                       return -EAGAIN;
> -               }
> -               if (ordered)
> -                       btrfs_put_ordered_extent(ordered);
> -
> -               *lockstart = start_pos;
> -               *lockend = last_pos;
> -               ret = 1;
> -       }
> -
> -       /*
> -        * We should be called after prepare_pages() which should have locked
> -        * all pages in the range.
> -        */
> -       for (i = 0; i < num_pages; i++)
> -               WARN_ON(!PageLocked(pages[i]));
> -
> -       return ret;
> -}
> -
>  /*
>   * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
>   *
> @@ -1246,7 +1169,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
>                 size_t copied;
>                 size_t dirty_sectors;
>                 size_t num_sectors;
> -               int extents_locked;
> +               int extents_locked = false;

This is an int. So either assign to 0 or change the type to bool (preferable).
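A minimal sketch of the suggested declaration, keeping the existing variable name:

	/* Declared as bool, per the review, instead of assigning false to an int. */
	bool extents_locked = false;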

>
>                 /*
>                  * Fault pages before locking them in prepare_pages
> @@ -1310,13 +1233,19 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
>                 }
>
>                 release_bytes = reserve_bytes;
> -again:
>                 ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
>                 if (ret) {
>                         btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
>                         break;
>                 }
>
> +               if (nowait && !btrfs_has_ordered_extent(BTRFS_I(inode), pos, write_bytes)) {

The logic is not correct: the ! should be removed.
I.e. if it's a nowait write and there are ordered extents, we want to
stop because it would make us block waiting for ordered extents.

> +                       btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
> +                       break;

Before the break we also need a:  ret = -EAGAIN

We should also prevent looking up ordered extents if the write starts
at or beyond i_size, just like before this patch.
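
Taken together, a rough sketch of this check with those fixes folded in
(btrfs_has_ordered_extent() is the helper introduced in patch 1/2 of this
series; its exact semantics are assumed here):

	/*
	 * Sketch only: drop the '!', return -EAGAIN on the nowait path,
	 * and skip the ordered extent handling entirely when the write
	 * starts at or beyond i_size, as before this patch.
	 */
	if (round_down(pos, fs_info->sectorsize) < i_size_read(inode)) {
		if (nowait) {
			if (btrfs_has_ordered_extent(BTRFS_I(inode), pos, write_bytes)) {
				btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
				ret = -EAGAIN;
				break;
			}
		} else {
			btrfs_wait_ordered_range(BTRFS_I(inode), pos, write_bytes);
		}
	}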

> +               } else {
> +                       btrfs_wait_ordered_range(BTRFS_I(inode), pos, write_bytes);
> +               }
> +
>                 /*
>                  * This is going to setup the pages array with the number of
>                  * pages we want, so we don't really need to worry about the
> @@ -1330,20 +1259,6 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
>                         break;
>                 }
>
> -               extents_locked = lock_and_cleanup_extent_if_need(
> -                               BTRFS_I(inode), pages,
> -                               num_pages, pos, write_bytes, &lockstart,
> -                               &lockend, nowait, &cached_state);
> -               if (extents_locked < 0) {
> -                       if (!nowait && extents_locked == -EAGAIN)
> -                               goto again;
> -
> -                       btrfs_delalloc_release_extents(BTRFS_I(inode),
> -                                                      reserve_bytes);
> -                       ret = extents_locked;
> -                       break;
> -               }
> -
>                 copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
>
>                 num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
> @@ -1389,6 +1304,14 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
>                 release_bytes = round_up(copied + sector_offset,
>                                         fs_info->sectorsize);
>
> +               if (pos < inode->i_size) {
> +                       lockstart = round_down(pos, fs_info->sectorsize);
> +                       lockend = round_up(pos + copied, fs_info->sectorsize);
> +                       lock_extent(&BTRFS_I(inode)->io_tree, lockstart,
> +                                     lockend, &cached_state);

We should respect the nowait semantics and do a try_lock_extent() if
we're a nowait write.


> +                       extents_locked = true;

Same here, the type is int and not bool.

Thanks.

> +               }
> +
>                 ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
>                                         dirty_pages, pos, copied,
>                                         &cached_state, only_release_metadata);
> --
> 2.46.0
>
>
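Putting the locking comments from this reply together, a rough sketch of a
nowait-aware version of the lock taken before btrfs_dirty_pages() (error
unwinding of the prepared pages and reserved space is elided; the -1 on
lockend follows the inclusive range convention used by the removed helper):

	if (pos < i_size_read(inode)) {
		lockstart = round_down(pos, fs_info->sectorsize);
		lockend = round_up(pos + copied, fs_info->sectorsize) - 1;
		if (nowait) {
			if (!try_lock_extent(&BTRFS_I(inode)->io_tree, lockstart,
					     lockend, &cached_state)) {
				/* Cleanup of pages and reservations omitted. */
				ret = -EAGAIN;
				break;
			}
		} else {
			lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				    &cached_state);
		}
		extents_locked = true;
	}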
Filipe Manana Aug. 28, 2024, 2:21 p.m. UTC | #2
On Wed, Aug 28, 2024 at 1:54 PM Goldwyn Rodrigues <rgoldwyn@suse.de> wrote:
>
> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>
> Reduce the scope of locking while performing writes. The scope is
> limited to changing extent bits, which is in btrfs_dirty_pages().
> btrfs_dirty_pages() is also called from __btrfs_write_out_cache(), and
> it expects extent locks held. So, perform extent locking around
> btrfs_dirty_pages() only.

There's one fundamental problem I missed before. Comments inlined below.

>
> The inode lock will ensure that no other process is writing to this
> file, so we don't need to worry about multiple processes dirtying
> folios.

Well, there's an exception: memory mapped writes, as they don't take
the inode lock.

So this patch introduces a race, see below.

>
> However, the write has to make sure that there are no ordered extents in
> the range specified. So, call btrfs_wait_ordered_range() before
> initiating the write. In case of nowait, bail if there exists an
> ordered range within the write range.
>
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> ---
>  fs/btrfs/file.c | 109 +++++++-----------------------------------------
>  1 file changed, 16 insertions(+), 93 deletions(-)
>
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 76f4cc686af9..6c5f712bfa0f 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -976,83 +976,6 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
>
>  }
>
> -/*
> - * This function locks the extent and properly waits for data=ordered extents
> - * to finish before allowing the pages to be modified if need.
> - *
> - * The return value:
> - * 1 - the extent is locked
> - * 0 - the extent is not locked, and everything is OK
> - * -EAGAIN - need re-prepare the pages
> - * the other < 0 number - Something wrong happens
> - */
> -static noinline int
> -lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
> -                               size_t num_pages, loff_t pos,
> -                               size_t write_bytes,
> -                               u64 *lockstart, u64 *lockend, bool nowait,
> -                               struct extent_state **cached_state)
> -{
> -       struct btrfs_fs_info *fs_info = inode->root->fs_info;
> -       u64 start_pos;
> -       u64 last_pos;
> -       int i;
> -       int ret = 0;
> -
> -       start_pos = round_down(pos, fs_info->sectorsize);
> -       last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
> -
> -       if (start_pos < inode->vfs_inode.i_size) {
> -               struct btrfs_ordered_extent *ordered;
> -
> -               if (nowait) {
> -                       if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
> -                                            cached_state)) {
> -                               for (i = 0; i < num_pages; i++) {
> -                                       unlock_page(pages[i]);
> -                                       put_page(pages[i]);
> -                                       pages[i] = NULL;
> -                               }
> -
> -                               return -EAGAIN;
> -                       }
> -               } else {
> -                       lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
> -               }
> -
> -               ordered = btrfs_lookup_ordered_range(inode, start_pos,
> -                                                    last_pos - start_pos + 1);
> -               if (ordered &&
> -                   ordered->file_offset + ordered->num_bytes > start_pos &&
> -                   ordered->file_offset <= last_pos) {
> -                       unlock_extent(&inode->io_tree, start_pos, last_pos,
> -                                     cached_state);
> -                       for (i = 0; i < num_pages; i++) {
> -                               unlock_page(pages[i]);
> -                               put_page(pages[i]);
> -                       }
> -                       btrfs_start_ordered_extent(ordered);
> -                       btrfs_put_ordered_extent(ordered);
> -                       return -EAGAIN;
> -               }
> -               if (ordered)
> -                       btrfs_put_ordered_extent(ordered);
> -
> -               *lockstart = start_pos;
> -               *lockend = last_pos;
> -               ret = 1;
> -       }
> -
> -       /*
> -        * We should be called after prepare_pages() which should have locked
> -        * all pages in the range.
> -        */
> -       for (i = 0; i < num_pages; i++)
> -               WARN_ON(!PageLocked(pages[i]));
> -
> -       return ret;
> -}
> -
>  /*
>   * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
>   *
> @@ -1246,7 +1169,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
>                 size_t copied;
>                 size_t dirty_sectors;
>                 size_t num_sectors;
> -               int extents_locked;
> +               int extents_locked = false;
>
>                 /*
>                  * Fault pages before locking them in prepare_pages
> @@ -1310,13 +1233,19 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
>                 }
>
>                 release_bytes = reserve_bytes;
> -again:
>                 ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
>                 if (ret) {
>                         btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
>                         break;
>                 }
>
> +               if (nowait && !btrfs_has_ordered_extent(BTRFS_I(inode), pos, write_bytes)) {
> +                       btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
> +                       break;
> +               } else {
> +                       btrfs_wait_ordered_range(BTRFS_I(inode), pos, write_bytes);

So after waiting for ordered extents, we call prepare_pages() below,
which is what gets and locks the pages.

But between this wait and the moment we lock the pages at prepare_pages(),
a mmap write can happen and dirty pages. If delalloc is then flushed
before we lock the pages, an ordered extent for the range is created -
and we will miss it here and not wait for it to complete.

Before this patch that couldn't happen, as we always lock the extent
range before locking pages.

Thanks.


> +               }
> +
>                 /*
>                  * This is going to setup the pages array with the number of
>                  * pages we want, so we don't really need to worry about the
> @@ -1330,20 +1259,6 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
>                         break;
>                 }
>
> -               extents_locked = lock_and_cleanup_extent_if_need(
> -                               BTRFS_I(inode), pages,
> -                               num_pages, pos, write_bytes, &lockstart,
> -                               &lockend, nowait, &cached_state);
> -               if (extents_locked < 0) {
> -                       if (!nowait && extents_locked == -EAGAIN)
> -                               goto again;
> -
> -                       btrfs_delalloc_release_extents(BTRFS_I(inode),
> -                                                      reserve_bytes);
> -                       ret = extents_locked;
> -                       break;
> -               }
> -
>                 copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
>
>                 num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
> @@ -1389,6 +1304,14 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
>                 release_bytes = round_up(copied + sector_offset,
>                                         fs_info->sectorsize);
>
> +               if (pos < inode->i_size) {
> +                       lockstart = round_down(pos, fs_info->sectorsize);
> +                       lockend = round_up(pos + copied, fs_info->sectorsize);
> +                       lock_extent(&BTRFS_I(inode)->io_tree, lockstart,
> +                                     lockend, &cached_state);
> +                       extents_locked = true;
> +               }
> +
>                 ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
>                                         dirty_pages, pos, copied,
>                                         &cached_state, only_release_metadata);
> --
> 2.46.0
>
>
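One way to close the race described above, mirroring what the removed
lock_and_cleanup_extent_if_need() did under the extent lock, would be to
re-check for ordered extents once prepare_pages() has locked the pages.
A rough sketch, reusing start_pos/last_pos as computed in the removed
helper and assuming the "again:" retry label dropped by this patch is
reinstated:

	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start_pos,
					     last_pos - start_pos + 1);
	if (ordered) {
		/* Release the just-prepared pages before waiting. */
		for (i = 0; i < num_pages; i++) {
			unlock_page(pages[i]);
			put_page(pages[i]);
		}
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
		goto again;	/* re-wait for ordered extents and re-prepare */
	}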

Patch

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 76f4cc686af9..6c5f712bfa0f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -976,83 +976,6 @@  static noinline int prepare_pages(struct inode *inode, struct page **pages,
 
 }
 
-/*
- * This function locks the extent and properly waits for data=ordered extents
- * to finish before allowing the pages to be modified if need.
- *
- * The return value:
- * 1 - the extent is locked
- * 0 - the extent is not locked, and everything is OK
- * -EAGAIN - need re-prepare the pages
- * the other < 0 number - Something wrong happens
- */
-static noinline int
-lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
-				size_t num_pages, loff_t pos,
-				size_t write_bytes,
-				u64 *lockstart, u64 *lockend, bool nowait,
-				struct extent_state **cached_state)
-{
-	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	u64 start_pos;
-	u64 last_pos;
-	int i;
-	int ret = 0;
-
-	start_pos = round_down(pos, fs_info->sectorsize);
-	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
-
-	if (start_pos < inode->vfs_inode.i_size) {
-		struct btrfs_ordered_extent *ordered;
-
-		if (nowait) {
-			if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
-					     cached_state)) {
-				for (i = 0; i < num_pages; i++) {
-					unlock_page(pages[i]);
-					put_page(pages[i]);
-					pages[i] = NULL;
-				}
-
-				return -EAGAIN;
-			}
-		} else {
-			lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
-		}
-
-		ordered = btrfs_lookup_ordered_range(inode, start_pos,
-						     last_pos - start_pos + 1);
-		if (ordered &&
-		    ordered->file_offset + ordered->num_bytes > start_pos &&
-		    ordered->file_offset <= last_pos) {
-			unlock_extent(&inode->io_tree, start_pos, last_pos,
-				      cached_state);
-			for (i = 0; i < num_pages; i++) {
-				unlock_page(pages[i]);
-				put_page(pages[i]);
-			}
-			btrfs_start_ordered_extent(ordered);
-			btrfs_put_ordered_extent(ordered);
-			return -EAGAIN;
-		}
-		if (ordered)
-			btrfs_put_ordered_extent(ordered);
-
-		*lockstart = start_pos;
-		*lockend = last_pos;
-		ret = 1;
-	}
-
-	/*
-	 * We should be called after prepare_pages() which should have locked
-	 * all pages in the range.
-	 */
-	for (i = 0; i < num_pages; i++)
-		WARN_ON(!PageLocked(pages[i]));
-
-	return ret;
-}
-
 /*
  * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
  *
@@ -1246,7 +1169,7 @@  ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
 		size_t copied;
 		size_t dirty_sectors;
 		size_t num_sectors;
-		int extents_locked;
+		int extents_locked = false;
 
 		/*
 		 * Fault pages before locking them in prepare_pages
@@ -1310,13 +1233,19 @@  ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
 		}
 
 		release_bytes = reserve_bytes;
-again:
 		ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
 		if (ret) {
 			btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
 			break;
 		}
 
+		if (nowait && !btrfs_has_ordered_extent(BTRFS_I(inode), pos, write_bytes)) {
+			btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
+			break;
+		} else {
+			btrfs_wait_ordered_range(BTRFS_I(inode), pos, write_bytes);
+		}
+
 		/*
 		 * This is going to setup the pages array with the number of
 		 * pages we want, so we don't really need to worry about the
@@ -1330,20 +1259,6 @@  ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
 			break;
 		}
 
-		extents_locked = lock_and_cleanup_extent_if_need(
-				BTRFS_I(inode), pages,
-				num_pages, pos, write_bytes, &lockstart,
-				&lockend, nowait, &cached_state);
-		if (extents_locked < 0) {
-			if (!nowait && extents_locked == -EAGAIN)
-				goto again;
-
-			btrfs_delalloc_release_extents(BTRFS_I(inode),
-						       reserve_bytes);
-			ret = extents_locked;
-			break;
-		}
-
 		copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
 
 		num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
@@ -1389,6 +1304,14 @@  ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
 		release_bytes = round_up(copied + sector_offset,
 					fs_info->sectorsize);
 
+		if (pos < inode->i_size) {
+			lockstart = round_down(pos, fs_info->sectorsize);
+			lockend = round_up(pos + copied, fs_info->sectorsize);
+			lock_extent(&BTRFS_I(inode)->io_tree, lockstart,
+				      lockend, &cached_state);
+			extents_locked = true;
+		}
+
 		ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
 					dirty_pages, pos, copied,
 					&cached_state, only_release_metadata);