diff mbox series

[18/42] btrfs: move the dio_sem higher up the callchain

Message ID 20180928111821.24376-19-josef@toxicpanda.com (mailing list archive)
State New, archived
Headers show
Series My current patch queue | expand

Commit Message

Josef Bacik Sept. 28, 2018, 11:17 a.m. UTC
We're getting a lockdep splat because we take the dio_sem under the
log_mutex.  What we really need is to protect fsync() from logging an
extent map for an extent we never waited on higher up, so just guard the
whole thing with dio_sem.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/file.c     | 12 ++++++++++++
 fs/btrfs/tree-log.c |  2 --
 2 files changed, 12 insertions(+), 2 deletions(-)

Comments

David Sterba Oct. 3, 2018, 12:27 p.m. UTC | #1
On Fri, Sep 28, 2018 at 07:17:57AM -0400, Josef Bacik wrote:
> We're getting a lockdep splat because we take the dio_sem under the

Can you please add the important bits of the lockdep warning to the
changelog? And possibly reference to the test or workload that triggers
that.

> log_mutex.  What we really need is to protect fsync() from logging an
> extent map for an extent we never waited on higher up, so just guard the
> whole thing with dio_sem.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/file.c     | 12 ++++++++++++
>  fs/btrfs/tree-log.c |  2 --
>  2 files changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 095f0bb86bb7..c07110edb9de 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -2079,6 +2079,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
>  		goto out;
>  
>  	inode_lock(inode);
> +
> +	/*
> +	 * We take the dio_sem here because the tree log stuff can race with
> +	 * lockless dio writes and get an extent map logged for an extent we
> +	 * never waited on.  We need it this high up for lockdep reasons.
> +	 */
> +	down_write(&BTRFS_I(inode)->dio_sem);
> +
>  	atomic_inc(&root->log_batch);
>  
>  	/*
> @@ -2087,6 +2095,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
>  	 */
>  	ret = btrfs_wait_ordered_range(inode, start, len);
>  	if (ret) {
> +		up_write(&BTRFS_I(inode)->dio_sem);
>  		inode_unlock(inode);
>  		goto out;
>  	}
> @@ -2110,6 +2119,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
>  		 * checked called fsync.
>  		 */
>  		ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
> +		up_write(&BTRFS_I(inode)->dio_sem);
>  		inode_unlock(inode);
>  		goto out;
>  	}
> @@ -2128,6 +2138,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
>  	trans = btrfs_start_transaction(root, 0);
>  	if (IS_ERR(trans)) {
>  		ret = PTR_ERR(trans);
> +		up_write(&BTRFS_I(inode)->dio_sem);
>  		inode_unlock(inode);
>  		goto out;
>  	}
> @@ -2149,6 +2160,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
>  	 * file again, but that will end up using the synchronization
>  	 * inside btrfs_sync_log to keep things safe.
>  	 */
> +	up_write(&BTRFS_I(inode)->dio_sem);
>  	inode_unlock(inode);
>  
>  	/*
> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> index 1650dc44a5e3..66b7e059b765 100644
> --- a/fs/btrfs/tree-log.c
> +++ b/fs/btrfs/tree-log.c
> @@ -4374,7 +4374,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
>  
>  	INIT_LIST_HEAD(&extents);
>  
> -	down_write(&inode->dio_sem);
>  	write_lock(&tree->lock);
>  	test_gen = root->fs_info->last_trans_committed;
>  	logged_start = start;
> @@ -4440,7 +4439,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
>  	}
>  	WARN_ON(!list_empty(&extents));
>  	write_unlock(&tree->lock);
> -	up_write(&inode->dio_sem);
>  
>  	btrfs_release_path(path);
>  	if (!ret)
> -- 
> 2.14.3
Filipe Manana Oct. 3, 2018, 2:54 p.m. UTC | #2
On Fri, Sep 28, 2018 at 12:19 PM Josef Bacik <josef@toxicpanda.com> wrote:
>
> We're getting a lockdep splat because we take the dio_sem under the
> log_mutex.  What we really need is to protect fsync() from logging an
> extent map for an extent we never waited on higher up, so just guard the
> whole thing with dio_sem.
>
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>

Looks good, thanks. However as David said, it would be nice to have a
sample trace pasted in the changelog (several fstests test cases
trigger this often).


> ---
>  fs/btrfs/file.c     | 12 ++++++++++++
>  fs/btrfs/tree-log.c |  2 --
>  2 files changed, 12 insertions(+), 2 deletions(-)
>
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 095f0bb86bb7..c07110edb9de 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -2079,6 +2079,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
>                 goto out;
>
>         inode_lock(inode);
> +
> +       /*
> +        * We take the dio_sem here because the tree log stuff can race with
> +        * lockless dio writes and get an extent map logged for an extent we
> +        * never waited on.  We need it this high up for lockdep reasons.
> +        */
> +       down_write(&BTRFS_I(inode)->dio_sem);
> +
>         atomic_inc(&root->log_batch);
>
>         /*
> @@ -2087,6 +2095,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
>          */
>         ret = btrfs_wait_ordered_range(inode, start, len);
>         if (ret) {
> +               up_write(&BTRFS_I(inode)->dio_sem);
>                 inode_unlock(inode);
>                 goto out;
>         }
> @@ -2110,6 +2119,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
>                  * checked called fsync.
>                  */
>                 ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
> +               up_write(&BTRFS_I(inode)->dio_sem);
>                 inode_unlock(inode);
>                 goto out;
>         }
> @@ -2128,6 +2138,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
>         trans = btrfs_start_transaction(root, 0);
>         if (IS_ERR(trans)) {
>                 ret = PTR_ERR(trans);
> +               up_write(&BTRFS_I(inode)->dio_sem);
>                 inode_unlock(inode);
>                 goto out;
>         }
> @@ -2149,6 +2160,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
>          * file again, but that will end up using the synchronization
>          * inside btrfs_sync_log to keep things safe.
>          */
> +       up_write(&BTRFS_I(inode)->dio_sem);
>         inode_unlock(inode);
>
>         /*
> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> index 1650dc44a5e3..66b7e059b765 100644
> --- a/fs/btrfs/tree-log.c
> +++ b/fs/btrfs/tree-log.c
> @@ -4374,7 +4374,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
>
>         INIT_LIST_HEAD(&extents);
>
> -       down_write(&inode->dio_sem);
>         write_lock(&tree->lock);
>         test_gen = root->fs_info->last_trans_committed;
>         logged_start = start;
> @@ -4440,7 +4439,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
>         }
>         WARN_ON(!list_empty(&extents));
>         write_unlock(&tree->lock);
> -       up_write(&inode->dio_sem);
>
>         btrfs_release_path(path);
>         if (!ret)
> --
> 2.14.3
>
diff mbox series

Patch

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 095f0bb86bb7..c07110edb9de 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2079,6 +2079,14 @@  int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		goto out;
 
 	inode_lock(inode);
+
+	/*
+	 * We take the dio_sem here because the tree log stuff can race with
+	 * lockless dio writes and get an extent map logged for an extent we
+	 * never waited on.  We need it this high up for lockdep reasons.
+	 */
+	down_write(&BTRFS_I(inode)->dio_sem);
+
 	atomic_inc(&root->log_batch);
 
 	/*
@@ -2087,6 +2095,7 @@  int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 */
 	ret = btrfs_wait_ordered_range(inode, start, len);
 	if (ret) {
+		up_write(&BTRFS_I(inode)->dio_sem);
 		inode_unlock(inode);
 		goto out;
 	}
@@ -2110,6 +2119,7 @@  int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		 * checked called fsync.
 		 */
 		ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
+		up_write(&BTRFS_I(inode)->dio_sem);
 		inode_unlock(inode);
 		goto out;
 	}
@@ -2128,6 +2138,7 @@  int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
+		up_write(&BTRFS_I(inode)->dio_sem);
 		inode_unlock(inode);
 		goto out;
 	}
@@ -2149,6 +2160,7 @@  int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 * file again, but that will end up using the synchronization
 	 * inside btrfs_sync_log to keep things safe.
 	 */
+	up_write(&BTRFS_I(inode)->dio_sem);
 	inode_unlock(inode);
 
 	/*
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1650dc44a5e3..66b7e059b765 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4374,7 +4374,6 @@  static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 
 	INIT_LIST_HEAD(&extents);
 
-	down_write(&inode->dio_sem);
 	write_lock(&tree->lock);
 	test_gen = root->fs_info->last_trans_committed;
 	logged_start = start;
@@ -4440,7 +4439,6 @@  static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 	}
 	WARN_ON(!list_empty(&extents));
 	write_unlock(&tree->lock);
-	up_write(&inode->dio_sem);
 
 	btrfs_release_path(path);
 	if (!ret)