diff mbox series

[v14,42/42] btrfs: reorder log node allocation

Message ID 246db67fcf56240127a252f09742684cd30f4cfe.1611627788.git.naohiro.aota@wdc.com (mailing list archive)
State New
Headers show
Series btrfs: zoned block device support | expand

Commit Message

Naohiro Aota Jan. 26, 2021, 2:25 a.m. UTC
This is the 3/3 patch to enable tree-log on ZONED mode.

The allocation order of nodes of "fs_info->log_root_tree" and nodes of
"root->log_root" is not the same as the writing order of them. So, the
writing causes unaligned write errors.

This patch reorders the allocation of them by delaying allocation of the
root node of "fs_info->log_root_tree," so that the node buffers can go out
sequentially to devices.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 fs/btrfs/disk-io.c  |  7 -------
 fs/btrfs/tree-log.c | 24 ++++++++++++++++++------
 2 files changed, 18 insertions(+), 13 deletions(-)

Comments

Filipe Manana Feb. 1, 2021, 3:48 p.m. UTC | #1
On Wed, Jan 27, 2021 at 6:48 PM Naohiro Aota <naohiro.aota@wdc.com> wrote:
>
> This is the 3/3 patch to enable tree-log on ZONED mode.
>
> The allocation order of nodes of "fs_info->log_root_tree" and nodes of
> "root->log_root" is not the same as the writing order of them. So, the
> writing causes unaligned write errors.
>
> This patch reorders the allocation of them by delaying allocation of the
> root node of "fs_info->log_root_tree," so that the node buffers can go out
> sequentially to devices.
>
> Reviewed-by: Josef Bacik <josef@toxicpanda.com>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
> ---
>  fs/btrfs/disk-io.c  |  7 -------
>  fs/btrfs/tree-log.c | 24 ++++++++++++++++++------
>  2 files changed, 18 insertions(+), 13 deletions(-)
>
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index c3b5cfe4d928..d2b30716de84 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -1241,18 +1241,11 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
>                              struct btrfs_fs_info *fs_info)
>  {
>         struct btrfs_root *log_root;
> -       int ret;
>
>         log_root = alloc_log_tree(trans, fs_info);
>         if (IS_ERR(log_root))
>                 return PTR_ERR(log_root);
>
> -       ret = btrfs_alloc_log_tree_node(trans, log_root);
> -       if (ret) {
> -               btrfs_put_root(log_root);
> -               return ret;
> -       }
> -
>         WARN_ON(fs_info->log_root_tree);
>         fs_info->log_root_tree = log_root;
>         return 0;
> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> index 71a1c0b5bc26..d8315363dc1e 100644
> --- a/fs/btrfs/tree-log.c
> +++ b/fs/btrfs/tree-log.c
> @@ -3159,6 +3159,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
>         list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
>         root_log_ctx.log_transid = log_root_tree->log_transid;
>
> +       mutex_lock(&fs_info->tree_log_mutex);
> +       if (!log_root_tree->node) {
> +               ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
> +               if (ret) {
> +                       mutex_unlock(&fs_info->tree_log_mutex);
> +                       goto out;
> +               }
> +       }
> +       mutex_unlock(&fs_info->tree_log_mutex);

Hum, so this now has an impact for non-zoned mode.

It reduces the parallelism between a previous transaction finishing
its commit and an fsync started in the current transaction.

A transaction commit releases fs_info->tree_log_mutex after it commits
the super block.

By taking that mutex here, we wait for the transaction commit to write
the super blocks before we update the log root, start writeback of log
tree nodes and wait for the writeback to complete.

Before this change, we would do those 3 things before waiting for the
previous transaction to commit the super blocks.

So this undoes part of what was made by the following commit:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=47876f7ceffa0e6af7476e052b3c061f1f2c1d9f

Which landed in 5.10. This patch and the rest of the patchset was
based on pre 5.10 code - was this missed because of that?
Or is there some other reason to have to do things this way for non-zoned mode?

I think we should preserve the behaviour for non-zoned mode - i.e. any
reason why not allocating log_root_tree->node at the top of start
log_trans(), while under the protection of tree_root->log_mutex?

My impression is that this, and the other patch with the subject
"btrfs: serialize log transaction on ZONED mode", are out of sync with
the changes in 5.10.

Thanks, sorry for the late review.


> +
>         /*
>          * Now we are safe to update the log_root_tree because we're under the
>          * log_mutex, and we're a current writer so we're holding the commit
> @@ -3317,12 +3327,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
>                 .process_func = process_one_buffer
>         };
>
> -       ret = walk_log_tree(trans, log, &wc);
> -       if (ret) {
> -               if (trans)
> -                       btrfs_abort_transaction(trans, ret);
> -               else
> -                       btrfs_handle_fs_error(log->fs_info, ret, NULL);
> +       if (log->node) {
> +               ret = walk_log_tree(trans, log, &wc);
> +               if (ret) {
> +                       if (trans)
> +                               btrfs_abort_transaction(trans, ret);
> +                       else
> +                               btrfs_handle_fs_error(log->fs_info, ret, NULL);
> +               }
>         }
>
>         clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
> --
> 2.27.0
>
Johannes Thumshirn Feb. 1, 2021, 3:54 p.m. UTC | #2
On 01/02/2021 16:49, Filipe Manana wrote:
> On Wed, Jan 27, 2021 at 6:48 PM Naohiro Aota <naohiro.aota@wdc.com> wrote:
>>
>> This is the 3/3 patch to enable tree-log on ZONED mode.
>>
>> The allocation order of nodes of "fs_info->log_root_tree" and nodes of
>> "root->log_root" is not the same as the writing order of them. So, the
>> writing causes unaligned write errors.
>>
>> This patch reorders the allocation of them by delaying allocation of the
>> root node of "fs_info->log_root_tree," so that the node buffers can go out
>> sequentially to devices.
>>
>> Reviewed-by: Josef Bacik <josef@toxicpanda.com>
>> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
>> ---
>>  fs/btrfs/disk-io.c  |  7 -------
>>  fs/btrfs/tree-log.c | 24 ++++++++++++++++++------
>>  2 files changed, 18 insertions(+), 13 deletions(-)
>>
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index c3b5cfe4d928..d2b30716de84 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -1241,18 +1241,11 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
>>                              struct btrfs_fs_info *fs_info)
>>  {
>>         struct btrfs_root *log_root;
>> -       int ret;
>>
>>         log_root = alloc_log_tree(trans, fs_info);
>>         if (IS_ERR(log_root))
>>                 return PTR_ERR(log_root);
>>
>> -       ret = btrfs_alloc_log_tree_node(trans, log_root);
>> -       if (ret) {
>> -               btrfs_put_root(log_root);
>> -               return ret;
>> -       }
>> -
>>         WARN_ON(fs_info->log_root_tree);
>>         fs_info->log_root_tree = log_root;
>>         return 0;
>> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
>> index 71a1c0b5bc26..d8315363dc1e 100644
>> --- a/fs/btrfs/tree-log.c
>> +++ b/fs/btrfs/tree-log.c
>> @@ -3159,6 +3159,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
>>         list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
>>         root_log_ctx.log_transid = log_root_tree->log_transid;
>>
>> +       mutex_lock(&fs_info->tree_log_mutex);
>> +       if (!log_root_tree->node) {
>> +               ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
>> +               if (ret) {
>> +                       mutex_unlock(&fs_info->tree_log_mutex);
>> +                       goto out;
>> +               }
>> +       }
>> +       mutex_unlock(&fs_info->tree_log_mutex);
> 
> Hum, so this now has an impact for non-zoned mode.
> 
> It reduces the parallelism between a previous transaction finishing
> its commit and an fsync started in the current transaction.
> 
> A transaction commit releases fs_info->tree_log_mutex after it commits
> the super block.
> 
> By taking that mutex here, we wait for the transaction commit to write
> the super blocks before we update the log root, start writeback of log
> tree nodes and wait for the writeback to complete.
> 
> Before this change, we would do those 3 things before waiting for the
> previous transaction to commit the super blocks.
> 
> So this undoes part of what was made by the following commit:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=47876f7ceffa0e6af7476e052b3c061f1f2c1d9f
> 
> Which landed in 5.10. This patch and the rest of the patchset was
> based on pre 5.10 code - was this missed because of that?
> Or is there some other reason to have to do things this way for non-zoned mode?
> 
> I think we should preserve the behaviour for non-zoned mode - i.e. any
> reason why not allocating log_root_tree->node at the top of start
> log_trans(), while under the protection of tree_root->log_mutex?
> 
> My impression is that this, and the other patch with the subject
> "btrfs: serialize log transaction on ZONED mode", are out of sync with
> the changes in 5.10.
> 
> Thanks, sorry for the late review.

Yes this code has been under development for quite some time and we probably 
missed this upstream update.

We will be sending a fix ASAP.
diff mbox series

Patch

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c3b5cfe4d928..d2b30716de84 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1241,18 +1241,11 @@  int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_root *log_root;
-	int ret;
 
 	log_root = alloc_log_tree(trans, fs_info);
 	if (IS_ERR(log_root))
 		return PTR_ERR(log_root);
 
-	ret = btrfs_alloc_log_tree_node(trans, log_root);
-	if (ret) {
-		btrfs_put_root(log_root);
-		return ret;
-	}
-
 	WARN_ON(fs_info->log_root_tree);
 	fs_info->log_root_tree = log_root;
 	return 0;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 71a1c0b5bc26..d8315363dc1e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3159,6 +3159,16 @@  int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
 	root_log_ctx.log_transid = log_root_tree->log_transid;
 
+	mutex_lock(&fs_info->tree_log_mutex);
+	if (!log_root_tree->node) {
+		ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
+		if (ret) {
+			mutex_unlock(&fs_info->tree_log_mutex);
+			goto out;
+		}
+	}
+	mutex_unlock(&fs_info->tree_log_mutex);
+
 	/*
 	 * Now we are safe to update the log_root_tree because we're under the
 	 * log_mutex, and we're a current writer so we're holding the commit
@@ -3317,12 +3327,14 @@  static void free_log_tree(struct btrfs_trans_handle *trans,
 		.process_func = process_one_buffer
 	};
 
-	ret = walk_log_tree(trans, log, &wc);
-	if (ret) {
-		if (trans)
-			btrfs_abort_transaction(trans, ret);
-		else
-			btrfs_handle_fs_error(log->fs_info, ret, NULL);
+	if (log->node) {
+		ret = walk_log_tree(trans, log, &wc);
+		if (ret) {
+			if (trans)
+				btrfs_abort_transaction(trans, ret);
+			else
+				btrfs_handle_fs_error(log->fs_info, ret, NULL);
+		}
 	}
 
 	clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,