diff mbox series

[v15,40/42] btrfs: zoned: serialize log transaction on zoned filesystems

Message ID 5eabc4600691c618f34f8f39c156d9c094f2687b.1612434091.git.naohiro.aota@wdc.com (mailing list archive)
State New, archived
Headers show
Series btrfs: zoned block device support | expand

Commit Message

Naohiro Aota Feb. 4, 2021, 10:22 a.m. UTC
This is patch 2 of 3 to enable tree-log on zoned filesystems.

Since we can start more than one log transaction per subvolume
simultaneously, nodes from multiple transactions can be allocated
interleaved. Such mixed allocation results in non-sequential writes at the
time of a log transaction commit. The nodes of the global log root tree
(fs_info->log_root_tree) also have the same problem with mixed
allocation.

Serialize log transactions by waiting for a committing transaction when
someone tries to start a new transaction, to avoid the mixed allocation
problem. We must also wait for running log transactions from another
subvolume, but there is no easy way to detect which subvolume root is
running a log transaction. So, this patch forbids starting a new log
transaction when other subvolumes have already allocated the global log
root tree.

Cc: Filipe Manana <fdmanana@gmail.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 fs/btrfs/tree-log.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

Comments

Filipe Manana Feb. 4, 2021, 11:50 a.m. UTC | #1
On Thu, Feb 4, 2021 at 10:23 AM Naohiro Aota <naohiro.aota@wdc.com> wrote:
>
> This is the 2/3 patch to enable tree-log on zoned filesystems.
>
> Since we can start more than one log transactions per subvolume
> simultaneously, nodes from multiple transactions can be allocated
> interleaved. Such mixed allocation results in non-sequential writes at the
> time of a log transaction commit. The nodes of the global log root tree
> (fs_info->log_root_tree), also have the same problem with mixed
> allocation.
>
> Serializes log transactions by waiting for a committing transaction when
> someone tries to start a new transaction, to avoid the mixed allocation
> problem. We must also wait for running log transactions from another
> subvolume, but there is no easy way to detect which subvolume root is
> running a log transaction. So, this patch forbids starting a new log
> transaction when other subvolumes already allocated the global log root
> tree.
>
> Cc: Filipe Manana <fdmanana@gmail.com>
> Reviewed-by: Josef Bacik <josef@toxicpanda.com>
> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
> ---
>  fs/btrfs/tree-log.c | 29 +++++++++++++++++++++++++++++
>  1 file changed, 29 insertions(+)
>
> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> index c02eeeac439c..8be3164d4c5d 100644
> --- a/fs/btrfs/tree-log.c
> +++ b/fs/btrfs/tree-log.c
> @@ -105,6 +105,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
>                                        struct btrfs_root *log,
>                                        struct btrfs_path *path,
>                                        u64 dirid, int del_all);
> +static void wait_log_commit(struct btrfs_root *root, int transid);
>
>  /*
>   * tree logging is a special write ahead log used to make sure that
> @@ -140,6 +141,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
>  {
>         struct btrfs_fs_info *fs_info = root->fs_info;
>         struct btrfs_root *tree_root = fs_info->tree_root;
> +       const bool zoned = btrfs_is_zoned(fs_info);
>         int ret = 0;
>
>         /*
> @@ -160,12 +162,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
>
>         mutex_lock(&root->log_mutex);
>
> +again:
>         if (root->log_root) {
> +               int index = (root->log_transid + 1) % 2;
> +
>                 if (btrfs_need_log_full_commit(trans)) {
>                         ret = -EAGAIN;
>                         goto out;
>                 }
>
> +               if (zoned && atomic_read(&root->log_commit[index])) {
> +                       wait_log_commit(root, root->log_transid - 1);
> +                       goto again;
> +               }
> +
>                 if (!root->log_start_pid) {
>                         clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
>                         root->log_start_pid = current->pid;
> @@ -173,6 +183,17 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
>                         set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
>                 }
>         } else {
> +               if (zoned) {
> +                       mutex_lock(&fs_info->tree_log_mutex);
> +                       if (fs_info->log_root_tree)
> +                               ret = -EAGAIN;
> +                       else
> +                               ret = btrfs_init_log_root_tree(trans, fs_info);
> +                       mutex_unlock(&fs_info->tree_log_mutex);
> +               }

So, nothing here has changed since v14 - all my comments still apply [1].
This is based on pre-5.10 code and is broken as it is - it results in
every fsync falling back to a transaction commit, defeating the
purpose of all the patches that deal with log trees on zoned
filesystems.

Thanks.

[1] https://lore.kernel.org/linux-btrfs/CAL3q7H5pv416FVwThOHe+M3L5B-z_n6_ZGQQxsUq5vC5fsAoJw@mail.gmail.com/


> +               if (ret)
> +                       goto out;
> +
>                 ret = btrfs_add_log_tree(trans, root);
>                 if (ret)
>                         goto out;
> @@ -201,14 +222,22 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
>   */
>  static int join_running_log_trans(struct btrfs_root *root)
>  {
> +       const bool zoned = btrfs_is_zoned(root->fs_info);
>         int ret = -ENOENT;
>
>         if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
>                 return ret;
>
>         mutex_lock(&root->log_mutex);
> +again:
>         if (root->log_root) {
> +               int index = (root->log_transid + 1) % 2;
> +
>                 ret = 0;
> +               if (zoned && atomic_read(&root->log_commit[index])) {
> +                       wait_log_commit(root, root->log_transid - 1);
> +                       goto again;
> +               }
>                 atomic_inc(&root->log_writers);
>         }
>         mutex_unlock(&root->log_mutex);
> --
> 2.30.0
>
Naohiro Aota Feb. 5, 2021, 7:21 a.m. UTC | #2
On Thu, Feb 04, 2021 at 11:50:45AM +0000, Filipe Manana wrote:
> On Thu, Feb 4, 2021 at 10:23 AM Naohiro Aota <naohiro.aota@wdc.com> wrote:
> >
> > This is the 2/3 patch to enable tree-log on zoned filesystems.
> >
> > Since we can start more than one log transactions per subvolume
> > simultaneously, nodes from multiple transactions can be allocated
> > interleaved. Such mixed allocation results in non-sequential writes at the
> > time of a log transaction commit. The nodes of the global log root tree
> > (fs_info->log_root_tree), also have the same problem with mixed
> > allocation.
> >
> > Serializes log transactions by waiting for a committing transaction when
> > someone tries to start a new transaction, to avoid the mixed allocation
> > problem. We must also wait for running log transactions from another
> > subvolume, but there is no easy way to detect which subvolume root is
> > running a log transaction. So, this patch forbids starting a new log
> > transaction when other subvolumes already allocated the global log root
> > tree.
> >
> > Cc: Filipe Manana <fdmanana@gmail.com>
> > Reviewed-by: Josef Bacik <josef@toxicpanda.com>
> > Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
> > ---
> >  fs/btrfs/tree-log.c | 29 +++++++++++++++++++++++++++++
> >  1 file changed, 29 insertions(+)
> >
> > diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> > index c02eeeac439c..8be3164d4c5d 100644
> > --- a/fs/btrfs/tree-log.c
> > +++ b/fs/btrfs/tree-log.c
> > @@ -105,6 +105,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
> >                                        struct btrfs_root *log,
> >                                        struct btrfs_path *path,
> >                                        u64 dirid, int del_all);
> > +static void wait_log_commit(struct btrfs_root *root, int transid);
> >
> >  /*
> >   * tree logging is a special write ahead log used to make sure that
> > @@ -140,6 +141,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
> >  {
> >         struct btrfs_fs_info *fs_info = root->fs_info;
> >         struct btrfs_root *tree_root = fs_info->tree_root;
> > +       const bool zoned = btrfs_is_zoned(fs_info);
> >         int ret = 0;
> >
> >         /*
> > @@ -160,12 +162,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
> >
> >         mutex_lock(&root->log_mutex);
> >
> > +again:
> >         if (root->log_root) {
> > +               int index = (root->log_transid + 1) % 2;
> > +
> >                 if (btrfs_need_log_full_commit(trans)) {
> >                         ret = -EAGAIN;
> >                         goto out;
> >                 }
> >
> > +               if (zoned && atomic_read(&root->log_commit[index])) {
> > +                       wait_log_commit(root, root->log_transid - 1);
> > +                       goto again;
> > +               }
> > +
> >                 if (!root->log_start_pid) {
> >                         clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
> >                         root->log_start_pid = current->pid;
> > @@ -173,6 +183,17 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
> >                         set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
> >                 }
> >         } else {
> > +               if (zoned) {
> > +                       mutex_lock(&fs_info->tree_log_mutex);
> > +                       if (fs_info->log_root_tree)
> > +                               ret = -EAGAIN;
> > +                       else
> > +                               ret = btrfs_init_log_root_tree(trans, fs_info);
> > +                       mutex_unlock(&fs_info->tree_log_mutex);
> > +               }
> 
> So, nothing here changed since v14 - all my comments still apply [1]
> This is based on pre-5.10 code and is broken as it is - it results in
> every fsync falling back to a transaction commit, defeating the
> purpose of all the patches that deal with log trees on zoned
> filesystems.
> 
> Thanks.
> 
> [1] https://lore.kernel.org/linux-btrfs/CAL3q7H5pv416FVwThOHe+M3L5B-z_n6_ZGQQxsUq5vC5fsAoJw@mail.gmail.com/

Yes...

As noted in the cover letter, there is a fix for this issue
itself. However, the fix revealed other failures in the fsync() path.
But, with further investigation, I found the failures are not really
related to the zoned fsync() code. So, I will soon post two patches (one
incremental for this one, and one to deal with a regression case).

> 
> 
> > +               if (ret)
> > +                       goto out;
> > +
> >                 ret = btrfs_add_log_tree(trans, root);
> >                 if (ret)
> >                         goto out;
> > @@ -201,14 +222,22 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
> >   */
> >  static int join_running_log_trans(struct btrfs_root *root)
> >  {
> > +       const bool zoned = btrfs_is_zoned(root->fs_info);
> >         int ret = -ENOENT;
> >
> >         if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
> >                 return ret;
> >
> >         mutex_lock(&root->log_mutex);
> > +again:
> >         if (root->log_root) {
> > +               int index = (root->log_transid + 1) % 2;
> > +
> >                 ret = 0;
> > +               if (zoned && atomic_read(&root->log_commit[index])) {
> > +                       wait_log_commit(root, root->log_transid - 1);
> > +                       goto again;
> > +               }
> >                 atomic_inc(&root->log_writers);
> >         }
> >         mutex_unlock(&root->log_mutex);
> > --
> > 2.30.0
> >
> 
> 
> -- 
> Filipe David Manana,
> 
> “Whether you think you can, or you think you can't — you're right.”
Naohiro Aota Feb. 5, 2021, 9:15 a.m. UTC | #3
David, could you fold the incremental diff below into this patch? Or, I
can send a full replacement patch.

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8be3164d4c5d..4e72794342c0 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -143,6 +143,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	const bool zoned = btrfs_is_zoned(fs_info);
 	int ret = 0;
+	bool created = false;
 
 	/*
 	 * First check if the log root tree was already created. If not, create
@@ -152,8 +153,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 		mutex_lock(&tree_root->log_mutex);
 		if (!fs_info->log_root_tree) {
 			ret = btrfs_init_log_root_tree(trans, fs_info);
-			if (!ret)
+			if (!ret) {
 				set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
+				created = true;
+			}
 		}
 		mutex_unlock(&tree_root->log_mutex);
 		if (ret)
@@ -183,16 +186,16 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 		}
 	} else {
-		if (zoned) {
-			mutex_lock(&fs_info->tree_log_mutex);
-			if (fs_info->log_root_tree)
-				ret = -EAGAIN;
-			else
-				ret = btrfs_init_log_root_tree(trans, fs_info);
-			mutex_unlock(&fs_info->tree_log_mutex);
-		}
-		if (ret)
+		/*
+		 * This means fs_info->log_root_tree was already created
+		 * for some other FS trees. Do the full commit not to mix
+		 * nodes from multiple log transactions to do sequential
+		 * writing.
+		 */
+		if (zoned && !created) {
+			ret = -EAGAIN;
 			goto out;
+		}
 
 		ret = btrfs_add_log_tree(trans, root);
 		if (ret)


On Thu, Feb 04, 2021 at 07:22:19PM +0900, Naohiro Aota wrote:
> This is the 2/3 patch to enable tree-log on zoned filesystems.
> 
> Since we can start more than one log transactions per subvolume
> simultaneously, nodes from multiple transactions can be allocated
> interleaved. Such mixed allocation results in non-sequential writes at the
> time of a log transaction commit. The nodes of the global log root tree
> (fs_info->log_root_tree), also have the same problem with mixed
> allocation.
> 
> Serializes log transactions by waiting for a committing transaction when
> someone tries to start a new transaction, to avoid the mixed allocation
> problem. We must also wait for running log transactions from another
> subvolume, but there is no easy way to detect which subvolume root is
> running a log transaction. So, this patch forbids starting a new log
> transaction when other subvolumes already allocated the global log root
> tree.
> 
> Cc: Filipe Manana <fdmanana@gmail.com>
> Reviewed-by: Josef Bacik <josef@toxicpanda.com>
> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
> ---
>  fs/btrfs/tree-log.c | 29 +++++++++++++++++++++++++++++
>  1 file changed, 29 insertions(+)
> 
> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> index c02eeeac439c..8be3164d4c5d 100644
> --- a/fs/btrfs/tree-log.c
> +++ b/fs/btrfs/tree-log.c
> @@ -105,6 +105,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
>  				       struct btrfs_root *log,
>  				       struct btrfs_path *path,
>  				       u64 dirid, int del_all);
> +static void wait_log_commit(struct btrfs_root *root, int transid);
>  
>  /*
>   * tree logging is a special write ahead log used to make sure that
> @@ -140,6 +141,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
>  {
>  	struct btrfs_fs_info *fs_info = root->fs_info;
>  	struct btrfs_root *tree_root = fs_info->tree_root;
> +	const bool zoned = btrfs_is_zoned(fs_info);
>  	int ret = 0;
>  
>  	/*
> @@ -160,12 +162,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
>  
>  	mutex_lock(&root->log_mutex);
>  
> +again:
>  	if (root->log_root) {
> +		int index = (root->log_transid + 1) % 2;
> +
>  		if (btrfs_need_log_full_commit(trans)) {
>  			ret = -EAGAIN;
>  			goto out;
>  		}
>  
> +		if (zoned && atomic_read(&root->log_commit[index])) {
> +			wait_log_commit(root, root->log_transid - 1);
> +			goto again;
> +		}
> +
>  		if (!root->log_start_pid) {
>  			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
>  			root->log_start_pid = current->pid;
> @@ -173,6 +183,17 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
>  			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
>  		}
>  	} else {
> +		if (zoned) {
> +			mutex_lock(&fs_info->tree_log_mutex);
> +			if (fs_info->log_root_tree)
> +				ret = -EAGAIN;
> +			else
> +				ret = btrfs_init_log_root_tree(trans, fs_info);
> +			mutex_unlock(&fs_info->tree_log_mutex);
> +		}
> +		if (ret)
> +			goto out;
> +
>  		ret = btrfs_add_log_tree(trans, root);
>  		if (ret)
>  			goto out;
> @@ -201,14 +222,22 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
>   */
>  static int join_running_log_trans(struct btrfs_root *root)
>  {
> +	const bool zoned = btrfs_is_zoned(root->fs_info);
>  	int ret = -ENOENT;
>  
>  	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
>  		return ret;
>  
>  	mutex_lock(&root->log_mutex);
> +again:
>  	if (root->log_root) {
> +		int index = (root->log_transid + 1) % 2;
> +
>  		ret = 0;
> +		if (zoned && atomic_read(&root->log_commit[index])) {
> +			wait_log_commit(root, root->log_transid - 1);
> +			goto again;
> +		}
>  		atomic_inc(&root->log_writers);
>  	}
>  	mutex_unlock(&root->log_mutex);
> -- 
> 2.30.0
>
Filipe Manana Feb. 5, 2021, 11:21 a.m. UTC | #4
On Fri, Feb 5, 2021 at 9:15 AM Naohiro Aota <naohiro.aota@wdc.com> wrote:
>
> David, could you fold the below incremental diff to this patch? Or, I
> can send a full replacement patch.
>
> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> index 8be3164d4c5d..4e72794342c0 100644
> --- a/fs/btrfs/tree-log.c
> +++ b/fs/btrfs/tree-log.c
> @@ -143,6 +143,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
>         struct btrfs_root *tree_root = fs_info->tree_root;
>         const bool zoned = btrfs_is_zoned(fs_info);
>         int ret = 0;
> +       bool created = false;
>
>         /*
>          * First check if the log root tree was already created. If not, create
> @@ -152,8 +153,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
>                 mutex_lock(&tree_root->log_mutex);
>                 if (!fs_info->log_root_tree) {
>                         ret = btrfs_init_log_root_tree(trans, fs_info);
> -                       if (!ret)
> +                       if (!ret) {
>                                 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
> +                               created = true;
> +                       }
>                 }
>                 mutex_unlock(&tree_root->log_mutex);
>                 if (ret)
> @@ -183,16 +186,16 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
>                         set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
>                 }
>         } else {
> -               if (zoned) {
> -                       mutex_lock(&fs_info->tree_log_mutex);
> -                       if (fs_info->log_root_tree)
> -                               ret = -EAGAIN;
> -                       else
> -                               ret = btrfs_init_log_root_tree(trans, fs_info);
> -                       mutex_unlock(&fs_info->tree_log_mutex);
> -               }
> -               if (ret)
> +               /*
> +                * This means fs_info->log_root_tree was already created
> +                * for some other FS trees. Do the full commit not to mix
> +                * nodes from multiple log transactions to do sequential
> +                * writing.
> +                */
> +               if (zoned && !created) {
> +                       ret = -EAGAIN;
>                         goto out;
> +               }
>
>                 ret = btrfs_add_log_tree(trans, root);
>                 if (ret)
>

Ok, with this, it looks good to me and you can have,

Reviewed-by: Filipe Manana <fdmanana@suse.com>

Thanks.

>
> On Thu, Feb 04, 2021 at 07:22:19PM +0900, Naohiro Aota wrote:
> > This is the 2/3 patch to enable tree-log on zoned filesystems.
> >
> > Since we can start more than one log transactions per subvolume
> > simultaneously, nodes from multiple transactions can be allocated
> > interleaved. Such mixed allocation results in non-sequential writes at the
> > time of a log transaction commit. The nodes of the global log root tree
> > (fs_info->log_root_tree), also have the same problem with mixed
> > allocation.
> >
> > Serializes log transactions by waiting for a committing transaction when
> > someone tries to start a new transaction, to avoid the mixed allocation
> > problem. We must also wait for running log transactions from another
> > subvolume, but there is no easy way to detect which subvolume root is
> > running a log transaction. So, this patch forbids starting a new log
> > transaction when other subvolumes already allocated the global log root
> > tree.
> >
> > Cc: Filipe Manana <fdmanana@gmail.com>
> > Reviewed-by: Josef Bacik <josef@toxicpanda.com>
> > Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
> > ---
> >  fs/btrfs/tree-log.c | 29 +++++++++++++++++++++++++++++
> >  1 file changed, 29 insertions(+)
> >
> > diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> > index c02eeeac439c..8be3164d4c5d 100644
> > --- a/fs/btrfs/tree-log.c
> > +++ b/fs/btrfs/tree-log.c
> > @@ -105,6 +105,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
> >                                      struct btrfs_root *log,
> >                                      struct btrfs_path *path,
> >                                      u64 dirid, int del_all);
> > +static void wait_log_commit(struct btrfs_root *root, int transid);
> >
> >  /*
> >   * tree logging is a special write ahead log used to make sure that
> > @@ -140,6 +141,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
> >  {
> >       struct btrfs_fs_info *fs_info = root->fs_info;
> >       struct btrfs_root *tree_root = fs_info->tree_root;
> > +     const bool zoned = btrfs_is_zoned(fs_info);
> >       int ret = 0;
> >
> >       /*
> > @@ -160,12 +162,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
> >
> >       mutex_lock(&root->log_mutex);
> >
> > +again:
> >       if (root->log_root) {
> > +             int index = (root->log_transid + 1) % 2;
> > +
> >               if (btrfs_need_log_full_commit(trans)) {
> >                       ret = -EAGAIN;
> >                       goto out;
> >               }
> >
> > +             if (zoned && atomic_read(&root->log_commit[index])) {
> > +                     wait_log_commit(root, root->log_transid - 1);
> > +                     goto again;
> > +             }
> > +
> >               if (!root->log_start_pid) {
> >                       clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
> >                       root->log_start_pid = current->pid;
> > @@ -173,6 +183,17 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
> >                       set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
> >               }
> >       } else {
> > +             if (zoned) {
> > +                     mutex_lock(&fs_info->tree_log_mutex);
> > +                     if (fs_info->log_root_tree)
> > +                             ret = -EAGAIN;
> > +                     else
> > +                             ret = btrfs_init_log_root_tree(trans, fs_info);
> > +                     mutex_unlock(&fs_info->tree_log_mutex);
> > +             }
> > +             if (ret)
> > +                     goto out;
> > +
> >               ret = btrfs_add_log_tree(trans, root);
> >               if (ret)
> >                       goto out;
> > @@ -201,14 +222,22 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
> >   */
> >  static int join_running_log_trans(struct btrfs_root *root)
> >  {
> > +     const bool zoned = btrfs_is_zoned(root->fs_info);
> >       int ret = -ENOENT;
> >
> >       if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
> >               return ret;
> >
> >       mutex_lock(&root->log_mutex);
> > +again:
> >       if (root->log_root) {
> > +             int index = (root->log_transid + 1) % 2;
> > +
> >               ret = 0;
> > +             if (zoned && atomic_read(&root->log_commit[index])) {
> > +                     wait_log_commit(root, root->log_transid - 1);
> > +                     goto again;
> > +             }
> >               atomic_inc(&root->log_writers);
> >       }
> >       mutex_unlock(&root->log_mutex);
> > --
> > 2.30.0
> >
David Sterba Feb. 9, 2021, 1:49 a.m. UTC | #5
On Fri, Feb 05, 2021 at 06:15:16PM +0900, Naohiro Aota wrote:
> David, could you fold the below incremental diff to this patch? Or, I
> can send a full replacement patch.

Folded to the patch, thanks.
diff mbox series

Patch

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c02eeeac439c..8be3164d4c5d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -105,6 +105,7 @@  static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 				       struct btrfs_root *log,
 				       struct btrfs_path *path,
 				       u64 dirid, int del_all);
+static void wait_log_commit(struct btrfs_root *root, int transid);
 
 /*
  * tree logging is a special write ahead log used to make sure that
@@ -140,6 +141,7 @@  static int start_log_trans(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_root *tree_root = fs_info->tree_root;
+	const bool zoned = btrfs_is_zoned(fs_info);
 	int ret = 0;
 
 	/*
@@ -160,12 +162,20 @@  static int start_log_trans(struct btrfs_trans_handle *trans,
 
 	mutex_lock(&root->log_mutex);
 
+again:
 	if (root->log_root) {
+		int index = (root->log_transid + 1) % 2;
+
 		if (btrfs_need_log_full_commit(trans)) {
 			ret = -EAGAIN;
 			goto out;
 		}
 
+		if (zoned && atomic_read(&root->log_commit[index])) {
+			wait_log_commit(root, root->log_transid - 1);
+			goto again;
+		}
+
 		if (!root->log_start_pid) {
 			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 			root->log_start_pid = current->pid;
@@ -173,6 +183,17 @@  static int start_log_trans(struct btrfs_trans_handle *trans,
 			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 		}
 	} else {
+		if (zoned) {
+			mutex_lock(&fs_info->tree_log_mutex);
+			if (fs_info->log_root_tree)
+				ret = -EAGAIN;
+			else
+				ret = btrfs_init_log_root_tree(trans, fs_info);
+			mutex_unlock(&fs_info->tree_log_mutex);
+		}
+		if (ret)
+			goto out;
+
 		ret = btrfs_add_log_tree(trans, root);
 		if (ret)
 			goto out;
@@ -201,14 +222,22 @@  static int start_log_trans(struct btrfs_trans_handle *trans,
  */
 static int join_running_log_trans(struct btrfs_root *root)
 {
+	const bool zoned = btrfs_is_zoned(root->fs_info);
 	int ret = -ENOENT;
 
 	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
 		return ret;
 
 	mutex_lock(&root->log_mutex);
+again:
 	if (root->log_root) {
+		int index = (root->log_transid + 1) % 2;
+
 		ret = 0;
+		if (zoned && atomic_read(&root->log_commit[index])) {
+			wait_log_commit(root, root->log_transid - 1);
+			goto again;
+		}
 		atomic_inc(&root->log_writers);
 	}
 	mutex_unlock(&root->log_mutex);