
[v3] btrfs: reduce lock contention when eb cache miss for btree search

Message ID 20241015074137.26067-1-robbieko@synology.com (mailing list archive)
State New
Series [v3] btrfs: reduce lock contention when eb cache miss for btree search

Commit Message

robbieko Oct. 15, 2024, 7:41 a.m. UTC
From: Robbie Ko <robbieko@synology.com>

When traversing the btree, if an extent buffer (eb) cache miss occurs,
we switch to taking the eb's read lock and release all previously held
locks (including the parent lock) to reduce lock contention.

If an eb cache miss occurs at a leaf and IO is needed, before this
change we released locks only from level 2 and up, so we read the
leaf's content from disk while still holding a lock on its parent
(level 1), causing unnecessary lock contention on the parent. After
this change we release locks from level 1 and up, take a read lock on
level 0 instead, and then read the leaf's content from disk.

Because the check parameters are prepared in advance and we hold the
eb's read lock, we can ensure that no race will occur during the check
and cause unexpected errors.

Signed-off-by: Robbie Ko <robbieko@synology.com>
---
v3: Improve the changelog and rename variables
v2: Add skip_locking handling
 fs/btrfs/ctree.c | 101 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 70 insertions(+), 31 deletions(-)
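
To make the locking-order change easier to follow, below is a minimal
userspace sketch that models the tree levels with pthread rwlocks. It is
illustrative only and not btrfs code: level_lock[], walk_old(), walk_new()
and read_leaf_from_disk() are invented names, with level 0 standing for the
leaf being read, level 1 for its parent and level 2 for the grandparent.

/*
 * Illustrative userspace model of the locking change, not btrfs code.
 * level_lock[], walk_old() and walk_new() are made-up names; level 0 is
 * the leaf being read, level 1 its parent, level 2 the grandparent.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t level_lock[3] = {
	PTHREAD_RWLOCK_INITIALIZER,
	PTHREAD_RWLOCK_INITIALIZER,
	PTHREAD_RWLOCK_INITIALIZER,
};

/* Stand-in for the blocking read done by btrfs_read_extent_buffer(). */
static void read_leaf_from_disk(void)
{
	printf("reading leaf from disk\n");
}

/* Before the patch: only level 2 and up are dropped before the IO. */
static void walk_old(void)
{
	pthread_rwlock_rdlock(&level_lock[2]);
	pthread_rwlock_rdlock(&level_lock[1]);

	pthread_rwlock_unlock(&level_lock[2]);	/* btrfs_unlock_up_safe(p, level + 1) */
	read_leaf_from_disk();			/* IO while the parent is still locked */
	pthread_rwlock_unlock(&level_lock[1]);	/* parent released only afterwards */
}

/* After the patch: the leaf is read-locked, then level 1 and up are dropped. */
static void walk_new(void)
{
	pthread_rwlock_rdlock(&level_lock[2]);
	pthread_rwlock_rdlock(&level_lock[1]);

	pthread_rwlock_unlock(&level_lock[2]);	/* btrfs_unlock_up_safe(p, level + 1) */
	pthread_rwlock_rdlock(&level_lock[0]);	/* btrfs_tree_read_lock(tmp) */
	pthread_rwlock_unlock(&level_lock[1]);	/* btrfs_release_path(p) drops the parent */
	read_leaf_from_disk();			/* IO with only the leaf read lock held */
	pthread_rwlock_unlock(&level_lock[0]);
}

int main(void)
{
	walk_old();
	walk_new();
	return 0;
}

The key difference is that in walk_new() the blocking read runs after the
parent lock has been dropped, so other searches descending through the same
parent no longer wait behind the disk IO.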

Comments

Filipe Manana Oct. 15, 2024, 9:51 a.m. UTC | #1
On Tue, Oct 15, 2024 at 8:41 AM robbieko <robbieko@synology.com> wrote:
>
> From: Robbie Ko <robbieko@synology.com>
>
> When traversing the btree, if an extent buffer (eb) cache miss occurs,
> we switch to taking the eb's read lock and release all previously held
> locks (including the parent lock) to reduce lock contention.
>
> If an eb cache miss occurs at a leaf and IO is needed, before this
> change we released locks only from level 2 and up, so we read the
> leaf's content from disk while still holding a lock on its parent
> (level 1), causing unnecessary lock contention on the parent. After
> this change we release locks from level 1 and up, take a read lock on
> level 0 instead, and then read the leaf's content from disk.
>
> Because the check parameters are prepared in advance and we hold the
> eb's read lock, we can ensure that no race will occur during the check
> and cause unexpected errors.
>
> Signed-off-by: Robbie Ko <robbieko@synology.com>
> ---
> v3: Improve the changelog and rename variables
> v2: Add skip_locking handling
>  fs/btrfs/ctree.c | 101 ++++++++++++++++++++++++++++++++---------------
>  1 file changed, 70 insertions(+), 31 deletions(-)
>
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index 0cc919d15b14..dc1d4e05214a 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -1515,12 +1515,14 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
>         struct btrfs_tree_parent_check check = { 0 };
>         u64 blocknr;
>         u64 gen;
> -       struct extent_buffer *tmp;
> -       int ret;
> +       struct extent_buffer *tmp = NULL;
> +       int ret = 0;
>         int parent_level;
> -       bool unlock_up;
> +       int err;
> +       bool read_tmp = false;
> +       bool tmp_locked = false;
> +       bool path_released = false;
>
> -       unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
>         blocknr = btrfs_node_blockptr(*eb_ret, slot);
>         gen = btrfs_node_ptr_generation(*eb_ret, slot);
>         parent_level = btrfs_header_level(*eb_ret);
> @@ -1551,68 +1553,105 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
>                          */
>                         if (btrfs_verify_level_key(tmp,
>                                         parent_level - 1, &check.first_key, gen)) {
> -                               free_extent_buffer(tmp);
> -                               return -EUCLEAN;
> +                               ret = -EUCLEAN;
> +                               goto out;
>                         }
>                         *eb_ret = tmp;
> -                       return 0;
> +                       tmp = NULL;
> +                       ret = 0;
> +                       goto out;
>                 }
>
>                 if (p->nowait) {
> -                       free_extent_buffer(tmp);
> -                       return -EAGAIN;
> +                       ret = -EAGAIN;
> +                       goto out;
>                 }
>
> -               if (unlock_up)
> +               if (!p->skip_locking) {
>                         btrfs_unlock_up_safe(p, level + 1);
> -
> -               /* now we're allowed to do a blocking uptodate check */
> -               ret = btrfs_read_extent_buffer(tmp, &check);
> -               if (ret) {
> -                       free_extent_buffer(tmp);
> +                       tmp_locked = true;
> +                       btrfs_tree_read_lock(tmp);
>                         btrfs_release_path(p);
> -                       return ret;
> +                       ret = -EAGAIN;
> +                       path_released = true;
>                 }
>
> -               if (unlock_up)
> -                       ret = -EAGAIN;
> +               /* Now we're allowed to do a blocking uptodate check. */
> +               err = btrfs_read_extent_buffer(tmp, &check);
> +               if (err) {
> +                       ret = err;
> +                       goto out;
> +               }
>
> +               if (ret == 0) {
> +                       ASSERT(!tmp_locked);
> +                       *eb_ret = tmp;
> +                       tmp = NULL;
> +               }
>                 goto out;
>         } else if (p->nowait) {
> -               return -EAGAIN;
> +               ret = -EAGAIN;
> +               goto out;
>         }
>
> -       if (unlock_up) {
> +       if (!p->skip_locking) {
>                 btrfs_unlock_up_safe(p, level + 1);
>                 ret = -EAGAIN;
> -       } else {
> -               ret = 0;
>         }
>
>         if (p->reada != READA_NONE)
>                 reada_for_search(fs_info, p, level, slot, key->objectid);
>
> -       tmp = read_tree_block(fs_info, blocknr, &check);
> +       tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
>         if (IS_ERR(tmp)) {
> +               ret = PTR_ERR(tmp);
> +               tmp = NULL;
> +               goto out;
> +       }
> +       read_tmp = true;
> +
> +       if (!p->skip_locking) {
> +               ASSERT(ret);

We can be more explicit here and do instead:

ASSERT(ret == -EAGAIN);

The patch looks good to me, and I'll soon add it to the for-next
branch, after some testing, with that small change if you're ok with
it.

Reviewed-by: Filipe Manana <fdmanana@suse.com>

Thanks.

> +               tmp_locked = true;
> +               btrfs_tree_read_lock(tmp);
>                 btrfs_release_path(p);
> -               return PTR_ERR(tmp);
> +               path_released = true;
> +       }
> +
> +       /* Now we're allowed to do a blocking uptodate check. */
> +       err = btrfs_read_extent_buffer(tmp, &check);
> +       if (err) {
> +               ret = err;
> +               goto out;
>         }
> +
>         /*
>          * If the read above didn't mark this buffer up to date,
>          * it will never end up being up to date.  Set ret to EIO now
>          * and give up so that our caller doesn't loop forever
>          * on our EAGAINs.
>          */
> -       if (!extent_buffer_uptodate(tmp))
> +       if (!extent_buffer_uptodate(tmp)) {
>                 ret = -EIO;
> +               goto out;
> +       }
>
> -out:
>         if (ret == 0) {
> +               ASSERT(!tmp_locked);
>                 *eb_ret = tmp;
> -       } else {
> -               free_extent_buffer(tmp);
> -               btrfs_release_path(p);
> +               tmp = NULL;
> +       }
> +out:
> +       if (tmp) {
> +               if (tmp_locked)
> +                       btrfs_tree_read_unlock(tmp);
> +               if (read_tmp && ret && ret != -EAGAIN)
> +                       free_extent_buffer_stale(tmp);
> +               else
> +                       free_extent_buffer(tmp);
>         }
> +       if (ret && !path_released)
> +               btrfs_release_path(p);
>
>         return ret;
>  }
> @@ -2198,7 +2237,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>                 }
>
>                 err = read_block_for_search(root, p, &b, level, slot, key);
> -               if (err == -EAGAIN)
> +               if (err == -EAGAIN && !p->nowait)
>                         goto again;
>                 if (err) {
>                         ret = err;
> @@ -2325,7 +2364,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
>                 }
>
>                 err = read_block_for_search(root, p, &b, level, slot, key);
> -               if (err == -EAGAIN)
> +               if (err == -EAGAIN && !p->nowait)
>                         goto again;
>                 if (err) {
>                         ret = err;
> --
> 2.17.1
>
>

Patch

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0cc919d15b14..dc1d4e05214a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1515,12 +1515,14 @@  read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
 	struct btrfs_tree_parent_check check = { 0 };
 	u64 blocknr;
 	u64 gen;
-	struct extent_buffer *tmp;
-	int ret;
+	struct extent_buffer *tmp = NULL;
+	int ret = 0;
 	int parent_level;
-	bool unlock_up;
+	int err;
+	bool read_tmp = false;
+	bool tmp_locked = false;
+	bool path_released = false;
 
-	unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
 	blocknr = btrfs_node_blockptr(*eb_ret, slot);
 	gen = btrfs_node_ptr_generation(*eb_ret, slot);
 	parent_level = btrfs_header_level(*eb_ret);
@@ -1551,68 +1553,105 @@  read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
 			 */
 			if (btrfs_verify_level_key(tmp,
 					parent_level - 1, &check.first_key, gen)) {
-				free_extent_buffer(tmp);
-				return -EUCLEAN;
+				ret = -EUCLEAN;
+				goto out;
 			}
 			*eb_ret = tmp;
-			return 0;
+			tmp = NULL;
+			ret = 0;
+			goto out;
 		}
 
 		if (p->nowait) {
-			free_extent_buffer(tmp);
-			return -EAGAIN;
+			ret = -EAGAIN;
+			goto out;
 		}
 
-		if (unlock_up)
+		if (!p->skip_locking) {
 			btrfs_unlock_up_safe(p, level + 1);
-
-		/* now we're allowed to do a blocking uptodate check */
-		ret = btrfs_read_extent_buffer(tmp, &check);
-		if (ret) {
-			free_extent_buffer(tmp);
+			tmp_locked = true;
+			btrfs_tree_read_lock(tmp);
 			btrfs_release_path(p);
-			return ret;
+			ret = -EAGAIN;
+			path_released = true;
 		}
 
-		if (unlock_up)
-			ret = -EAGAIN;
+		/* Now we're allowed to do a blocking uptodate check. */
+		err = btrfs_read_extent_buffer(tmp, &check);
+		if (err) {
+			ret = err;
+			goto out;
+		}
 
+		if (ret == 0) {
+			ASSERT(!tmp_locked);
+			*eb_ret = tmp;
+			tmp = NULL;
+		}
 		goto out;
 	} else if (p->nowait) {
-		return -EAGAIN;
+		ret = -EAGAIN;
+		goto out;
 	}
 
-	if (unlock_up) {
+	if (!p->skip_locking) {
 		btrfs_unlock_up_safe(p, level + 1);
 		ret = -EAGAIN;
-	} else {
-		ret = 0;
 	}
 
 	if (p->reada != READA_NONE)
 		reada_for_search(fs_info, p, level, slot, key->objectid);
 
-	tmp = read_tree_block(fs_info, blocknr, &check);
+	tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
 	if (IS_ERR(tmp)) {
+		ret = PTR_ERR(tmp);
+		tmp = NULL;
+		goto out;
+	}
+	read_tmp = true;
+
+	if (!p->skip_locking) {
+		ASSERT(ret);
+		tmp_locked = true;
+		btrfs_tree_read_lock(tmp);
 		btrfs_release_path(p);
-		return PTR_ERR(tmp);
+		path_released = true;
+	}
+
+	/* Now we're allowed to do a blocking uptodate check. */
+	err = btrfs_read_extent_buffer(tmp, &check);
+	if (err) {
+		ret = err;
+		goto out;
 	}
+
 	/*
 	 * If the read above didn't mark this buffer up to date,
 	 * it will never end up being up to date.  Set ret to EIO now
 	 * and give up so that our caller doesn't loop forever
 	 * on our EAGAINs.
 	 */
-	if (!extent_buffer_uptodate(tmp))
+	if (!extent_buffer_uptodate(tmp)) {
 		ret = -EIO;
+		goto out;
+	}
 
-out:
 	if (ret == 0) {
+		ASSERT(!tmp_locked);
 		*eb_ret = tmp;
-	} else {
-		free_extent_buffer(tmp);
-		btrfs_release_path(p);
+		tmp = NULL;
+	}
+out:
+	if (tmp) {
+		if (tmp_locked)
+			btrfs_tree_read_unlock(tmp);
+		if (read_tmp && ret && ret != -EAGAIN)
+			free_extent_buffer_stale(tmp);
+		else
+			free_extent_buffer(tmp);
 	}
+	if (ret && !path_released)
+		btrfs_release_path(p);
 
 	return ret;
 }
@@ -2198,7 +2237,7 @@  int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		}
 
 		err = read_block_for_search(root, p, &b, level, slot, key);
-		if (err == -EAGAIN)
+		if (err == -EAGAIN && !p->nowait)
 			goto again;
 		if (err) {
 			ret = err;
@@ -2325,7 +2364,7 @@  int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
 		}
 
 		err = read_block_for_search(root, p, &b, level, slot, key);
-		if (err == -EAGAIN)
+		if (err == -EAGAIN && !p->nowait)
 			goto again;
 		if (err) {
 			ret = err;