diff mbox

Btrfs: track dirty block groups on their own list V2

Message ID 1419014495-11815-1-git-send-email-jbacik@fb.com (mailing list archive)
State Accepted
Headers show

Commit Message

Josef Bacik Dec. 19, 2014, 6:41 p.m. UTC
Currently any time we try to update the block groups on disk we will walk _all_
block groups and check for the ->dirty flag to see if it is set.  This function
can get called several times during a commit.  So if you have several terabytes
of data you will be a very sad panda as we will loop through _all_ of the block
groups several times, which makes the commit take a while which slows down the
rest of the file system operations.

This patch introduces a dirty list that block groups are added to the first
time they are dirtied.  Then we simply update any block groups that have been
dirtied since the last time we called btrfs_write_dirty_block_groups.  This
also allows us to clean up how we write the free space cache out.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
V1->V2: Don't unconditionally take the dirty bg list lock in update_block_group;
only do it if our dirty_bg list is empty.

 fs/btrfs/ctree.h            |   5 +-
 fs/btrfs/extent-tree.c      | 169 ++++++++++++++------------------------------
 fs/btrfs/free-space-cache.c |   8 ++-
 fs/btrfs/transaction.c      |  14 ++--
 fs/btrfs/transaction.h      |   2 +
 5 files changed, 74 insertions(+), 124 deletions(-)

Comments

Liu Bo Dec. 22, 2014, 9:06 a.m. UTC | #1
On Fri, Dec 19, 2014 at 01:41:35PM -0500, Josef Bacik wrote:
> Currently any time we try to update the block groups on disk we will walk _all_
> block groups and check for the ->dirty flag to see if it is set.  This function
> can get called several times during a commit.  So if you have several terabytes
> of data you will be a very sad panda as we will loop through _all_ of the block
> groups several times, which makes the commit take a while which slows down the
> rest of the file system operations.
> 
> This patch introduces a dirty list for the block groups that we get added to
> when we dirty the block group for the first time.  Then we simply update any
> block groups that have been dirtied since the last time we called
> btrfs_write_dirty_block_groups.  This allows us to clean up how we write the
> free space cache out so it is much cleaner.  Thanks,
> 
> Signed-off-by: Josef Bacik <jbacik@fb.com>
> ---
> V1->V2: Don't unconditionally take the dirty bg list lock in update_block_group,
> only do it if our dirty_bg list is empty.
> 
>  fs/btrfs/ctree.h            |   5 +-
>  fs/btrfs/extent-tree.c      | 169 ++++++++++++++------------------------------
>  fs/btrfs/free-space-cache.c |   8 ++-
>  fs/btrfs/transaction.c      |  14 ++--
>  fs/btrfs/transaction.h      |   2 +
>  5 files changed, 74 insertions(+), 124 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index b62315b..e5bc509 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1237,7 +1237,6 @@ enum btrfs_disk_cache_state {
>  	BTRFS_DC_ERROR		= 1,
>  	BTRFS_DC_CLEAR		= 2,
>  	BTRFS_DC_SETUP		= 3,
> -	BTRFS_DC_NEED_WRITE	= 4,
>  };
>  
>  struct btrfs_caching_control {
> @@ -1275,7 +1274,6 @@ struct btrfs_block_group_cache {
>  	unsigned long full_stripe_len;
>  
>  	unsigned int ro:1;
> -	unsigned int dirty:1;
>  	unsigned int iref:1;
>  
>  	int disk_cache_state;
> @@ -1309,6 +1307,9 @@ struct btrfs_block_group_cache {
>  
>  	/* For read-only block groups */
>  	struct list_head ro_list;
> +
> +	/* For dirty block groups */
> +	struct list_head dirty_list;
>  };
>  
>  /* delayed seq elem */
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 74eb29d..71a9752 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -74,8 +74,9 @@ enum {
>  	RESERVE_ALLOC_NO_ACCOUNT = 2,
>  };
>  
> -static int update_block_group(struct btrfs_root *root,
> -			      u64 bytenr, u64 num_bytes, int alloc);
> +static int update_block_group(struct btrfs_trans_handle *trans,
> +			      struct btrfs_root *root, u64 bytenr,
> +			      u64 num_bytes, int alloc);
>  static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
>  				struct btrfs_root *root,
>  				u64 bytenr, u64 num_bytes, u64 parent,
> @@ -3315,120 +3316,42 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
>  				   struct btrfs_root *root)
>  {
>  	struct btrfs_block_group_cache *cache;
> -	int err = 0;
> +	struct btrfs_transaction *cur_trans = trans->transaction;
> +	int ret = 0;
>  	struct btrfs_path *path;
> -	u64 last = 0;
> +
> +	if (list_empty(&cur_trans->dirty_bgs))
> +		return 0;
>  
>  	path = btrfs_alloc_path();
>  	if (!path)
>  		return -ENOMEM;
>  
> -again:
> -	while (1) {
> -		cache = btrfs_lookup_first_block_group(root->fs_info, last);
> -		while (cache) {
> -			if (cache->disk_cache_state == BTRFS_DC_CLEAR)
> -				break;
> -			cache = next_block_group(root, cache);
> -		}
> -		if (!cache) {
> -			if (last == 0)
> -				break;
> -			last = 0;
> -			continue;
> -		}
> -		err = cache_save_setup(cache, trans, path);
> -		last = cache->key.objectid + cache->key.offset;
> -		btrfs_put_block_group(cache);
> -	}
> -
> -	while (1) {
> -		if (last == 0) {
> -			err = btrfs_run_delayed_refs(trans, root,
> -						     (unsigned long)-1);
> -			if (err) /* File system offline */
> -				goto out;
> -		}
> -
> -		cache = btrfs_lookup_first_block_group(root->fs_info, last);
> -		while (cache) {
> -			if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
> -				btrfs_put_block_group(cache);
> -				goto again;
> -			}
> -
> -			if (cache->dirty)
> -				break;
> -			cache = next_block_group(root, cache);
> -		}
> -		if (!cache) {
> -			if (last == 0)
> -				break;
> -			last = 0;
> -			continue;
> -		}
> -
> -		if (cache->disk_cache_state == BTRFS_DC_SETUP)
> -			cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
> -		cache->dirty = 0;
> -		last = cache->key.objectid + cache->key.offset;
> -
> -		err = write_one_cache_group(trans, root, path, cache);
> -		btrfs_put_block_group(cache);
> -		if (err) /* File system offline */
> -			goto out;
> -	}
> -
> -	while (1) {
> -		/*
> -		 * I don't think this is needed since we're just marking our
> -		 * preallocated extent as written, but just in case it can't
> -		 * hurt.
> -		 */
> -		if (last == 0) {
> -			err = btrfs_run_delayed_refs(trans, root,
> -						     (unsigned long)-1);
> -			if (err) /* File system offline */
> -				goto out;
> -		}
> -
> -		cache = btrfs_lookup_first_block_group(root->fs_info, last);
> -		while (cache) {
> -			/*
> -			 * Really this shouldn't happen, but it could if we
> -			 * couldn't write the entire preallocated extent and
> -			 * splitting the extent resulted in a new block.
> -			 */
> -			if (cache->dirty) {
> -				btrfs_put_block_group(cache);
> -				goto again;
> -			}
> -			if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
> -				break;
> -			cache = next_block_group(root, cache);
> -		}
> -		if (!cache) {
> -			if (last == 0)
> -				break;
> -			last = 0;
> -			continue;
> -		}
> -
> -		err = btrfs_write_out_cache(root, trans, cache, path);
> -
> -		/*
> -		 * If we didn't have an error then the cache state is still
> -		 * NEED_WRITE, so we can set it to WRITTEN.
> -		 */
> -		if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
> -			cache->disk_cache_state = BTRFS_DC_WRITTEN;
> -		last = cache->key.objectid + cache->key.offset;
> +	/*
> +	 * We don't need the lock here since we are protected by the transaction
> +	 * commit.  We want to do the cache_save_setup first and then run the
> +	 * delayed refs to make sure we have the best chance at doing this all
> +	 * in one shot.
> +	 */
> +	while (!list_empty(&cur_trans->dirty_bgs)) {
> +		cache = list_first_entry(&cur_trans->dirty_bgs,
> +					 struct btrfs_block_group_cache,
> +					 dirty_list);
> +		list_del_init(&cache->dirty_list);
> +		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
> +			cache_save_setup(cache, trans, path);
> +		if (!ret)
> +			ret = btrfs_run_delayed_refs(trans, root,
> +						     (unsigned long) -1);
> +		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
> +			btrfs_write_out_cache(root, trans, cache, path);
> +		if (!ret)
> +			ret = write_one_cache_group(trans, root, path, cache);
>  		btrfs_put_block_group(cache);
>  	}
> -out:
>  
>  	btrfs_free_path(path);
> -	return err;
> +	return ret;
>  }
>  
>  int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
> @@ -5375,8 +5298,9 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
>  	btrfs_free_reserved_data_space(inode, num_bytes);
>  }
>  
> -static int update_block_group(struct btrfs_root *root,
> -			      u64 bytenr, u64 num_bytes, int alloc)
> +static int update_block_group(struct btrfs_trans_handle *trans,
> +			      struct btrfs_root *root, u64 bytenr,
> +			      u64 num_bytes, int alloc)
>  {
>  	struct btrfs_block_group_cache *cache = NULL;
>  	struct btrfs_fs_info *info = root->fs_info;
> @@ -5414,6 +5338,16 @@ static int update_block_group(struct btrfs_root *root,
>  		if (!alloc && cache->cached == BTRFS_CACHE_NO)
>  			cache_block_group(cache, 1);
>  
> +		if (list_empty(&cache->dirty_list)) {
> +			spin_lock(&trans->transaction->dirty_bgs_lock);
> +			if (list_empty(&cache->dirty_list)) {
> +				list_add_tail(&cache->dirty_list,
> +					      &trans->transaction->dirty_bgs);
> +				btrfs_get_block_group(cache);
> +			}
> +			spin_unlock(&trans->transaction->dirty_bgs_lock);
> +		}
> +
>  		byte_in_group = bytenr - cache->key.objectid;
>  		WARN_ON(byte_in_group > cache->key.offset);
>  
> @@ -5424,7 +5358,6 @@ static int update_block_group(struct btrfs_root *root,
>  		    cache->disk_cache_state < BTRFS_DC_CLEAR)
>  			cache->disk_cache_state = BTRFS_DC_CLEAR;
>  
> -		cache->dirty = 1;
>  		old_val = btrfs_block_group_used(&cache->item);
>  		num_bytes = min(total, cache->key.offset - byte_in_group);
>  		if (alloc) {
> @@ -6102,7 +6035,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
>  			}
>  		}
>  
> -		ret = update_block_group(root, bytenr, num_bytes, 0);
> +		ret = update_block_group(trans, root, bytenr, num_bytes, 0);
>  		if (ret) {
>  			btrfs_abort_transaction(trans, extent_root, ret);
>  			goto out;
> @@ -7062,7 +6995,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
>  	if (ret)
>  		return ret;
>  
> -	ret = update_block_group(root, ins->objectid, ins->offset, 1);
> +	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
>  	if (ret) { /* -ENOENT, logic error */
>  		btrfs_err(fs_info, "update block group failed for %llu %llu",
>  			ins->objectid, ins->offset);
> @@ -7151,7 +7084,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
>  			return ret;
>  	}
>  
> -	ret = update_block_group(root, ins->objectid, root->nodesize, 1);
> +	ret = update_block_group(trans, root, ins->objectid, root->nodesize,
> +				 1);
>  	if (ret) { /* -ENOENT, logic error */
>  		btrfs_err(fs_info, "update block group failed for %llu %llu",
>  			ins->objectid, ins->offset);
> @@ -9003,6 +8937,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
>  	INIT_LIST_HEAD(&cache->cluster_list);
>  	INIT_LIST_HEAD(&cache->bg_list);
>  	INIT_LIST_HEAD(&cache->ro_list);
> +	INIT_LIST_HEAD(&cache->dirty_list);
>  	btrfs_init_free_space_ctl(cache);
>  
>  	return cache;
> @@ -9065,9 +9000,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
>  			 * b) Setting 'dirty flag' makes sure that we flush
>  			 *    the new space cache info onto disk.
>  			 */
> -			cache->disk_cache_state = BTRFS_DC_CLEAR;
>  			if (btrfs_test_opt(root, SPACE_CACHE))
> -				cache->dirty = 1;
> +				cache->disk_cache_state = BTRFS_DC_CLEAR;
>  		}
>  
>  		read_extent_buffer(leaf, &cache->item,
> @@ -9427,6 +9361,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>  	if (block_group->cached == BTRFS_CACHE_STARTED)
>  		wait_block_group_cache_done(block_group);
>  
> +	spin_lock(&trans->transaction->dirty_bgs_lock);
> +	if (!list_empty(&block_group->dirty_list)) {
> +		list_del_init(&block_group->dirty_list);
> +		btrfs_put_block_group(block_group);
> +	}
> +	spin_unlock(&trans->transaction->dirty_bgs_lock);
> +
>  	btrfs_remove_free_space_cache(block_group);
>  
>  	spin_lock(&block_group->space_info->lock);
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index 3384819..e85de8c 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -1210,6 +1210,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
>  	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
>  	struct inode *inode;
>  	int ret = 0;
> +	enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;
>  
>  	root = root->fs_info->tree_root;
>  
> @@ -1233,9 +1234,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
>  	ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
>  				      path, block_group->key.objectid);
>  	if (ret) {
> -		spin_lock(&block_group->lock);
> -		block_group->disk_cache_state = BTRFS_DC_ERROR;
> -		spin_unlock(&block_group->lock);
> +		dcs = BTRFS_DC_ERROR;
>  		ret = 0;
>  #ifdef DEBUG
>  		btrfs_err(root->fs_info,
> @@ -1244,6 +1243,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
>  #endif
>  	}
>  
> +	spin_lock(&block_group->lock);
> +	block_group->disk_cache_state = dcs;
> +	spin_unlock(&block_group->lock);
>  	iput(inode);
>  	return ret;
>  }
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index 6b1ac53..4b3f999 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -220,6 +220,8 @@ loop:
>  	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
>  	INIT_LIST_HEAD(&cur_trans->pending_chunks);
>  	INIT_LIST_HEAD(&cur_trans->switch_commits);
> +	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
> +	spin_lock_init(&cur_trans->dirty_bgs_lock);
>  	list_add_tail(&cur_trans->list, &fs_info->trans_list);
>  	extent_io_tree_init(&cur_trans->dirty_pages,
>  			     fs_info->btree_inode->i_mapping);
> @@ -957,7 +959,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
>  	while (1) {
>  		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
>  		if (old_root_bytenr == root->node->start &&
> -		    old_root_used == btrfs_root_used(&root->root_item))
> +		    old_root_used == btrfs_root_used(&root->root_item) &&
> +		    (!extent_root ||
> +		     list_empty(&trans->transaction->dirty_bgs)))
>  			break;
>  
>  		btrfs_set_root_node(&root->root_item, root->node);
> @@ -976,6 +980,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
>  		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
>  		if (ret)
>  			return ret;
> +		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
> +		if (ret)
> +			return ret;

Does the second btrfs_run_delayed_refs() mean to update extent_tree's delayed refs produced by the first one?

Thanks,

-liubo

>  	}
>  
>  	return 0;
> @@ -996,10 +1003,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
>  	struct extent_buffer *eb;
>  	int ret;
>  
> -	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
> -	if (ret)
> -		return ret;
> -
>  	eb = btrfs_lock_root_node(fs_info->tree_root);
>  	ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
>  			      0, &eb);
> @@ -1897,6 +1900,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
>  	switch_commit_roots(cur_trans, root->fs_info);
>  
>  	assert_qgroups_uptodate(trans);
> +	ASSERT(list_empty(&cur_trans->dirty_bgs));
>  	update_super_roots(root);
>  
>  	btrfs_set_super_log_root(root->fs_info->super_copy, 0);
> diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
> index d8f40e1..2aa345e 100644
> --- a/fs/btrfs/transaction.h
> +++ b/fs/btrfs/transaction.h
> @@ -57,6 +57,8 @@ struct btrfs_transaction {
>  	struct list_head pending_snapshots;
>  	struct list_head pending_chunks;
>  	struct list_head switch_commits;
> +	struct list_head dirty_bgs;
> +	spinlock_t dirty_bgs_lock;
>  	struct btrfs_delayed_ref_root delayed_refs;
>  	int aborted;
>  };
> -- 
> 1.8.3.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b62315b..e5bc509 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1237,7 +1237,6 @@  enum btrfs_disk_cache_state {
 	BTRFS_DC_ERROR		= 1,
 	BTRFS_DC_CLEAR		= 2,
 	BTRFS_DC_SETUP		= 3,
-	BTRFS_DC_NEED_WRITE	= 4,
 };
 
 struct btrfs_caching_control {
@@ -1275,7 +1274,6 @@  struct btrfs_block_group_cache {
 	unsigned long full_stripe_len;
 
 	unsigned int ro:1;
-	unsigned int dirty:1;
 	unsigned int iref:1;
 
 	int disk_cache_state;
@@ -1309,6 +1307,9 @@  struct btrfs_block_group_cache {
 
 	/* For read-only block groups */
 	struct list_head ro_list;
+
+	/* For dirty block groups */
+	struct list_head dirty_list;
 };
 
 /* delayed seq elem */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 74eb29d..71a9752 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -74,8 +74,9 @@  enum {
 	RESERVE_ALLOC_NO_ACCOUNT = 2,
 };
 
-static int update_block_group(struct btrfs_root *root,
-			      u64 bytenr, u64 num_bytes, int alloc);
+static int update_block_group(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root, u64 bytenr,
+			      u64 num_bytes, int alloc);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes, u64 parent,
@@ -3315,120 +3316,42 @@  int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root)
 {
 	struct btrfs_block_group_cache *cache;
-	int err = 0;
+	struct btrfs_transaction *cur_trans = trans->transaction;
+	int ret = 0;
 	struct btrfs_path *path;
-	u64 last = 0;
+
+	if (list_empty(&cur_trans->dirty_bgs))
+		return 0;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
-again:
-	while (1) {
-		cache = btrfs_lookup_first_block_group(root->fs_info, last);
-		while (cache) {
-			if (cache->disk_cache_state == BTRFS_DC_CLEAR)
-				break;
-			cache = next_block_group(root, cache);
-		}
-		if (!cache) {
-			if (last == 0)
-				break;
-			last = 0;
-			continue;
-		}
-		err = cache_save_setup(cache, trans, path);
-		last = cache->key.objectid + cache->key.offset;
-		btrfs_put_block_group(cache);
-	}
-
-	while (1) {
-		if (last == 0) {
-			err = btrfs_run_delayed_refs(trans, root,
-						     (unsigned long)-1);
-			if (err) /* File system offline */
-				goto out;
-		}
-
-		cache = btrfs_lookup_first_block_group(root->fs_info, last);
-		while (cache) {
-			if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
-				btrfs_put_block_group(cache);
-				goto again;
-			}
-
-			if (cache->dirty)
-				break;
-			cache = next_block_group(root, cache);
-		}
-		if (!cache) {
-			if (last == 0)
-				break;
-			last = 0;
-			continue;
-		}
-
-		if (cache->disk_cache_state == BTRFS_DC_SETUP)
-			cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
-		cache->dirty = 0;
-		last = cache->key.objectid + cache->key.offset;
-
-		err = write_one_cache_group(trans, root, path, cache);
-		btrfs_put_block_group(cache);
-		if (err) /* File system offline */
-			goto out;
-	}
-
-	while (1) {
-		/*
-		 * I don't think this is needed since we're just marking our
-		 * preallocated extent as written, but just in case it can't
-		 * hurt.
-		 */
-		if (last == 0) {
-			err = btrfs_run_delayed_refs(trans, root,
-						     (unsigned long)-1);
-			if (err) /* File system offline */
-				goto out;
-		}
-
-		cache = btrfs_lookup_first_block_group(root->fs_info, last);
-		while (cache) {
-			/*
-			 * Really this shouldn't happen, but it could if we
-			 * couldn't write the entire preallocated extent and
-			 * splitting the extent resulted in a new block.
-			 */
-			if (cache->dirty) {
-				btrfs_put_block_group(cache);
-				goto again;
-			}
-			if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
-				break;
-			cache = next_block_group(root, cache);
-		}
-		if (!cache) {
-			if (last == 0)
-				break;
-			last = 0;
-			continue;
-		}
-
-		err = btrfs_write_out_cache(root, trans, cache, path);
-
-		/*
-		 * If we didn't have an error then the cache state is still
-		 * NEED_WRITE, so we can set it to WRITTEN.
-		 */
-		if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
-			cache->disk_cache_state = BTRFS_DC_WRITTEN;
-		last = cache->key.objectid + cache->key.offset;
+	/*
+	 * We don't need the lock here since we are protected by the transaction
+	 * commit.  We want to do the cache_save_setup first and then run the
+	 * delayed refs to make sure we have the best chance at doing this all
+	 * in one shot.
+	 */
+	while (!list_empty(&cur_trans->dirty_bgs)) {
+		cache = list_first_entry(&cur_trans->dirty_bgs,
+					 struct btrfs_block_group_cache,
+					 dirty_list);
+		list_del_init(&cache->dirty_list);
+		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+			cache_save_setup(cache, trans, path);
+		if (!ret)
+			ret = btrfs_run_delayed_refs(trans, root,
+						     (unsigned long) -1);
+		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
+			btrfs_write_out_cache(root, trans, cache, path);
+		if (!ret)
+			ret = write_one_cache_group(trans, root, path, cache);
 		btrfs_put_block_group(cache);
 	}
-out:
 
 	btrfs_free_path(path);
-	return err;
+	return ret;
 }
 
 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -5375,8 +5298,9 @@  void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
 	btrfs_free_reserved_data_space(inode, num_bytes);
 }
 
-static int update_block_group(struct btrfs_root *root,
-			      u64 bytenr, u64 num_bytes, int alloc)
+static int update_block_group(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root, u64 bytenr,
+			      u64 num_bytes, int alloc)
 {
 	struct btrfs_block_group_cache *cache = NULL;
 	struct btrfs_fs_info *info = root->fs_info;
@@ -5414,6 +5338,16 @@  static int update_block_group(struct btrfs_root *root,
 		if (!alloc && cache->cached == BTRFS_CACHE_NO)
 			cache_block_group(cache, 1);
 
+		if (list_empty(&cache->dirty_list)) {
+			spin_lock(&trans->transaction->dirty_bgs_lock);
+			if (list_empty(&cache->dirty_list)) {
+				list_add_tail(&cache->dirty_list,
+					      &trans->transaction->dirty_bgs);
+				btrfs_get_block_group(cache);
+			}
+			spin_unlock(&trans->transaction->dirty_bgs_lock);
+		}
+
 		byte_in_group = bytenr - cache->key.objectid;
 		WARN_ON(byte_in_group > cache->key.offset);
 
@@ -5424,7 +5358,6 @@  static int update_block_group(struct btrfs_root *root,
 		    cache->disk_cache_state < BTRFS_DC_CLEAR)
 			cache->disk_cache_state = BTRFS_DC_CLEAR;
 
-		cache->dirty = 1;
 		old_val = btrfs_block_group_used(&cache->item);
 		num_bytes = min(total, cache->key.offset - byte_in_group);
 		if (alloc) {
@@ -6102,7 +6035,7 @@  static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			}
 		}
 
-		ret = update_block_group(root, bytenr, num_bytes, 0);
+		ret = update_block_group(trans, root, bytenr, num_bytes, 0);
 		if (ret) {
 			btrfs_abort_transaction(trans, extent_root, ret);
 			goto out;
@@ -7062,7 +6995,7 @@  static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 	if (ret)
 		return ret;
 
-	ret = update_block_group(root, ins->objectid, ins->offset, 1);
+	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
 	if (ret) { /* -ENOENT, logic error */
 		btrfs_err(fs_info, "update block group failed for %llu %llu",
 			ins->objectid, ins->offset);
@@ -7151,7 +7084,8 @@  static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 			return ret;
 	}
 
-	ret = update_block_group(root, ins->objectid, root->nodesize, 1);
+	ret = update_block_group(trans, root, ins->objectid, root->nodesize,
+				 1);
 	if (ret) { /* -ENOENT, logic error */
 		btrfs_err(fs_info, "update block group failed for %llu %llu",
 			ins->objectid, ins->offset);
@@ -9003,6 +8937,7 @@  btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
 	INIT_LIST_HEAD(&cache->cluster_list);
 	INIT_LIST_HEAD(&cache->bg_list);
 	INIT_LIST_HEAD(&cache->ro_list);
+	INIT_LIST_HEAD(&cache->dirty_list);
 	btrfs_init_free_space_ctl(cache);
 
 	return cache;
@@ -9065,9 +9000,8 @@  int btrfs_read_block_groups(struct btrfs_root *root)
 			 * b) Setting 'dirty flag' makes sure that we flush
 			 *    the new space cache info onto disk.
 			 */
-			cache->disk_cache_state = BTRFS_DC_CLEAR;
 			if (btrfs_test_opt(root, SPACE_CACHE))
-				cache->dirty = 1;
+				cache->disk_cache_state = BTRFS_DC_CLEAR;
 		}
 
 		read_extent_buffer(leaf, &cache->item,
@@ -9427,6 +9361,13 @@  int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	if (block_group->cached == BTRFS_CACHE_STARTED)
 		wait_block_group_cache_done(block_group);
 
+	spin_lock(&trans->transaction->dirty_bgs_lock);
+	if (!list_empty(&block_group->dirty_list)) {
+		list_del_init(&block_group->dirty_list);
+		btrfs_put_block_group(block_group);
+	}
+	spin_unlock(&trans->transaction->dirty_bgs_lock);
+
 	btrfs_remove_free_space_cache(block_group);
 
 	spin_lock(&block_group->space_info->lock);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 3384819..e85de8c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1210,6 +1210,7 @@  int btrfs_write_out_cache(struct btrfs_root *root,
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct inode *inode;
 	int ret = 0;
+	enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;
 
 	root = root->fs_info->tree_root;
 
@@ -1233,9 +1234,7 @@  int btrfs_write_out_cache(struct btrfs_root *root,
 	ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
 				      path, block_group->key.objectid);
 	if (ret) {
-		spin_lock(&block_group->lock);
-		block_group->disk_cache_state = BTRFS_DC_ERROR;
-		spin_unlock(&block_group->lock);
+		dcs = BTRFS_DC_ERROR;
 		ret = 0;
 #ifdef DEBUG
 		btrfs_err(root->fs_info,
@@ -1244,6 +1243,9 @@  int btrfs_write_out_cache(struct btrfs_root *root,
 #endif
 	}
 
+	spin_lock(&block_group->lock);
+	block_group->disk_cache_state = dcs;
+	spin_unlock(&block_group->lock);
 	iput(inode);
 	return ret;
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 6b1ac53..4b3f999 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -220,6 +220,8 @@  loop:
 	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
 	INIT_LIST_HEAD(&cur_trans->pending_chunks);
 	INIT_LIST_HEAD(&cur_trans->switch_commits);
+	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
+	spin_lock_init(&cur_trans->dirty_bgs_lock);
 	list_add_tail(&cur_trans->list, &fs_info->trans_list);
 	extent_io_tree_init(&cur_trans->dirty_pages,
 			     fs_info->btree_inode->i_mapping);
@@ -957,7 +959,9 @@  static int update_cowonly_root(struct btrfs_trans_handle *trans,
 	while (1) {
 		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
 		if (old_root_bytenr == root->node->start &&
-		    old_root_used == btrfs_root_used(&root->root_item))
+		    old_root_used == btrfs_root_used(&root->root_item) &&
+		    (!extent_root ||
+		     list_empty(&trans->transaction->dirty_bgs)))
 			break;
 
 		btrfs_set_root_node(&root->root_item, root->node);
@@ -976,6 +980,9 @@  static int update_cowonly_root(struct btrfs_trans_handle *trans,
 		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
 		if (ret)
 			return ret;
+		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+		if (ret)
+			return ret;
 	}
 
 	return 0;
@@ -996,10 +1003,6 @@  static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 	struct extent_buffer *eb;
 	int ret;
 
-	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-	if (ret)
-		return ret;
-
 	eb = btrfs_lock_root_node(fs_info->tree_root);
 	ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
 			      0, &eb);
@@ -1897,6 +1900,7 @@  int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	switch_commit_roots(cur_trans, root->fs_info);
 
 	assert_qgroups_uptodate(trans);
+	ASSERT(list_empty(&cur_trans->dirty_bgs));
 	update_super_roots(root);
 
 	btrfs_set_super_log_root(root->fs_info->super_copy, 0);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d8f40e1..2aa345e 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -57,6 +57,8 @@  struct btrfs_transaction {
 	struct list_head pending_snapshots;
 	struct list_head pending_chunks;
 	struct list_head switch_commits;
+	struct list_head dirty_bgs;
+	spinlock_t dirty_bgs_lock;
 	struct btrfs_delayed_ref_root delayed_refs;
 	int aborted;
 };