diff mbox series

[v11,39/40] btrfs: serialize log transaction on ZONED mode

Message ID 39b1c016d74422b9dcac01ba6e33d3ccd8000889.1608608848.git.naohiro.aota@wdc.com (mailing list archive)
State New, archived
Headers show
Series btrfs: zoned block device support | expand

Commit Message

Naohiro Aota Dec. 22, 2020, 3:49 a.m. UTC
This is the 2/3 patch to enable tree-log on ZONED mode.

Since we can start more than one log transactions per subvolume
simultaneously, nodes from multiple transactions can be allocated
interleaved. Such mixed allocation results in non-sequential writes at the
time of log transaction commit. The nodes of the global log root tree
(fs_info->log_root_tree), also have the same mixed allocation problem.

This patch serializes log transactions by waiting for a committing
transaction when someone tries to start a new transaction, to avoid the
mixed allocation problem. We must also wait for running log transactions
from another subvolume, but there is no easy way to detect which subvolume
root is running a log transaction. So, this patch forbids starting a new
log transaction when other subvolumes already allocated the global log root
tree.

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 fs/btrfs/tree-log.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

Comments

Josef Bacik Jan. 12, 2021, 7:50 p.m. UTC | #1
On 12/21/20 10:49 PM, Naohiro Aota wrote:
> This is the 2/3 patch to enable tree-log on ZONED mode.
> 
> Since we can start more than one log transactions per subvolume
> simultaneously, nodes from multiple transactions can be allocated
> interleaved. Such mixed allocation results in non-sequential writes at the
> time of log transaction commit. The nodes of the global log root tree
> (fs_info->log_root_tree), also have the same mixed allocation problem.
> 
> This patch serializes log transactions by waiting for a committing
> transaction when someone tries to start a new transaction, to avoid the
> mixed allocation problem. We must also wait for running log transactions
> from another subvolume, but there is no easy way to detect which subvolume
> root is running a log transaction. So, this patch forbids starting a new
> log transaction when other subvolumes already allocated the global log root
> tree.
> 
> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
> ---
>   fs/btrfs/tree-log.c | 27 +++++++++++++++++++++++++++
>   1 file changed, 27 insertions(+)
> 
> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> index 930e752686b4..d269c9ea8706 100644
> --- a/fs/btrfs/tree-log.c
> +++ b/fs/btrfs/tree-log.c
> @@ -105,6 +105,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
>   				       struct btrfs_root *log,
>   				       struct btrfs_path *path,
>   				       u64 dirid, int del_all);
> +static void wait_log_commit(struct btrfs_root *root, int transid);
>   
>   /*
>    * tree logging is a special write ahead log used to make sure that
> @@ -140,6 +141,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
>   {
>   	struct btrfs_fs_info *fs_info = root->fs_info;
>   	struct btrfs_root *tree_root = fs_info->tree_root;
> +	const bool zoned = btrfs_is_zoned(fs_info);
>   	int ret = 0;
>   
>   	/*
> @@ -160,12 +162,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
>   
>   	mutex_lock(&root->log_mutex);
>   
> +again:
>   	if (root->log_root) {
> +		int index = (root->log_transid + 1) % 2;
> +
>   		if (btrfs_need_log_full_commit(trans)) {
>   			ret = -EAGAIN;
>   			goto out;
>   		}
>   
> +		if (zoned && atomic_read(&root->log_commit[index])) {
> +			wait_log_commit(root, root->log_transid - 1);
> +			goto again;
> +		}
> +
>   		if (!root->log_start_pid) {
>   			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
>   			root->log_start_pid = current->pid;
> @@ -173,6 +183,15 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
>   			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
>   		}
>   	} else {
> +		mutex_lock(&fs_info->tree_log_mutex);
> +		if (zoned && fs_info->log_root_tree)
> +			ret = -EAGAIN;
> +		else if (!fs_info->log_root_tree)
> +			ret = btrfs_init_log_root_tree(trans, fs_info);
> +		mutex_unlock(&fs_info->tree_log_mutex);

You're adding lock contention here in the normal case, I'd rather this be

if (zoned) {
	mutex_lock(&fs_info->tree_log_mutex);
	if (fs_info->log_root_tree)
	etc...

Thanks,

Josef
diff mbox series

Patch

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 930e752686b4..d269c9ea8706 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -105,6 +105,7 @@  static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 				       struct btrfs_root *log,
 				       struct btrfs_path *path,
 				       u64 dirid, int del_all);
+static void wait_log_commit(struct btrfs_root *root, int transid);
 
 /*
  * tree logging is a special write ahead log used to make sure that
@@ -140,6 +141,7 @@  static int start_log_trans(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_root *tree_root = fs_info->tree_root;
+	const bool zoned = btrfs_is_zoned(fs_info);
 	int ret = 0;
 
 	/*
@@ -160,12 +162,20 @@  static int start_log_trans(struct btrfs_trans_handle *trans,
 
 	mutex_lock(&root->log_mutex);
 
+again:
 	if (root->log_root) {
+		int index = (root->log_transid + 1) % 2;
+
 		if (btrfs_need_log_full_commit(trans)) {
 			ret = -EAGAIN;
 			goto out;
 		}
 
+		if (zoned && atomic_read(&root->log_commit[index])) {
+			wait_log_commit(root, root->log_transid - 1);
+			goto again;
+		}
+
 		if (!root->log_start_pid) {
 			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 			root->log_start_pid = current->pid;
@@ -173,6 +183,15 @@  static int start_log_trans(struct btrfs_trans_handle *trans,
 			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 		}
 	} else {
+		mutex_lock(&fs_info->tree_log_mutex);
+		if (zoned && fs_info->log_root_tree)
+			ret = -EAGAIN;
+		else if (!fs_info->log_root_tree)
+			ret = btrfs_init_log_root_tree(trans, fs_info);
+		mutex_unlock(&fs_info->tree_log_mutex);
+		if (ret)
+			goto out;
+
 		ret = btrfs_add_log_tree(trans, root);
 		if (ret)
 			goto out;
@@ -201,14 +220,22 @@  static int start_log_trans(struct btrfs_trans_handle *trans,
  */
 static int join_running_log_trans(struct btrfs_root *root)
 {
+	const bool zoned = btrfs_is_zoned(root->fs_info);
 	int ret = -ENOENT;
 
 	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
 		return ret;
 
 	mutex_lock(&root->log_mutex);
+again:
 	if (root->log_root) {
+		int index = (root->log_transid + 1) % 2;
+
 		ret = 0;
+		if (zoned && atomic_read(&root->log_commit[index])) {
+			wait_log_commit(root, root->log_transid - 1);
+			goto again;
+		}
 		atomic_inc(&root->log_writers);
 	}
 	mutex_unlock(&root->log_mutex);