diff mbox

[v2] btrfs: should block unused block groups deletion work when allocating data space

Message ID 20160829105038.20293-1-wangxg.fnst@cn.fujitsu.com (mailing list archive)
State New, archived
Headers show

Commit Message

Xiaoguang Wang Aug. 29, 2016, 10:50 a.m. UTC
cleaner_kthread() may run at any time, in which it'll call btrfs_delete_unused_bgs()
to delete unused block groups. Because this work is asynchronous, it may also result
in false ENOSPC error. Please see below race window:

               CPU1                           |             CPU2
                                              |
|-> btrfs_alloc_data_chunk_ondemand()         |-> cleaner_kthread()
    |-> do_chunk_alloc()                      |   |
    |   assume it returns ENOSPC, which means |   |
    |   btrfs_space_info is full and have free|   |
    |   space to satisfy data request.        |   |
    |                                         |   |- > btrfs_delete_unused_bgs()
    |                                         |   |    it will decrease btrfs_space_info
    |                                         |   |    total_bytes and make
    |                                         |   |    btrfs_space_info is not full.
    |                                         |   |
In this case, we may get ENOSPC error, but btrfs_space_info is not full.

To fix this issue, in btrfs_alloc_data_chunk_ondemand(), if we need to call
do_chunk_alloc() to allocating new chunk, we should block btrfs_delete_unused_bgs().
Here we introduce a new struct rw_semaphore bg_delete_sem to do this job.

Indeed there is already a "struct mutex delete_unused_bgs_mutex", but it's mutex,
we can not use it for this purpose. Of course, we can re-define it to be struct
rw_semaphore, then use it in btrfs_alloc_data_chunk_ondemand(). Either method will
work.

But given that delete_unused_bgs_mutex's name length is longer than bg_delete_sem,
I choose the first method, to create a new struct rw_semaphore bg_delete_sem and
delete delete_unused_bgs_mutex :)

But after introducing this new rw_semaphore bg_delete_sem, there may occur
a deadlock, fstests test case btrfs/071 have revealed this deadlock, please
see call graph below:

        CPU1                                               |     CPU2
|-> btrfs_commit_transaction()                             |-> cleaner_kthread()
    |-> commit_cowonly_roots()                             |   |-> btrfs_delete_unused_bgs()
        |-> btrfs_setup_space_cache()                      |       |-> down_write(bg_delete_sem)
            |-> cache_save_setup()                         |       |
                |-> btrfs_check_data_free_space()          |       |
                    |-> btrfs_alloc_data_chunk_ondemand()  |       |-> btrfs_start_trans_remove_block_group()
                        |-> down_read(bg_delete_sem)       |           |-> wait_current_trans()

Indeed there is a lock order between wait-current-trans-finished and
down_write(bg_delete_sem), so in btrfs_delete_unused_bgs(), we must obey
this lock rule. We call btrfs_start_transaction(root, 0) before
down_write(bg_delete_sem) in btrfs_delete_unused_bgs().

In balance codes, we also down_read(bg_delete_sem) before start_transaction(),
but becasue bg_delete_sem is rw_semaphore, there will be no deadlock.

Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
---
V2: fix a deadlock revealed by fstests case btrfs/071, we call
    start_transaction() before in down_write(bg_delete_sem) in
    btrfs_delete_unused_bgs().
---
 fs/btrfs/ctree.h       |   2 +-
 fs/btrfs/disk-io.c     |  13 ++---
 fs/btrfs/extent-tree.c | 146 ++++++++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/volumes.c     |  42 +++++++-------
 4 files changed, 155 insertions(+), 48 deletions(-)
diff mbox

Patch

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 09cdff0..a8d6267 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -788,6 +788,7 @@  struct btrfs_fs_info {
 	struct mutex cleaner_mutex;
 	struct mutex chunk_mutex;
 	struct mutex volume_mutex;
+	struct rw_semaphore bg_delete_sem;
 
 	/*
 	 * this is taken to make sure we don't set block groups ro after
@@ -1068,7 +1069,6 @@  struct btrfs_fs_info {
 	spinlock_t unused_bgs_lock;
 	struct list_head unused_bgs;
 	struct mutex unused_bg_unpin_mutex;
-	struct mutex delete_unused_bgs_mutex;
 
 	/* For btrfs to record security options */
 	struct security_mnt_opts security_opts;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7857f64..a4cfe99 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1837,12 +1837,11 @@  static int cleaner_kthread(void *arg)
 		btrfs_run_defrag_inodes(root->fs_info);
 
 		/*
-		 * Acquires fs_info->delete_unused_bgs_mutex to avoid racing
-		 * with relocation (btrfs_relocate_chunk) and relocation
-		 * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
-		 * after acquiring fs_info->delete_unused_bgs_mutex. So we
-		 * can't hold, nor need to, fs_info->cleaner_mutex when deleting
-		 * unused block groups.
+		 * Acquires fs_info->bg_delete_sem to avoid racing with
+		 * relocation (btrfs_relocate_chunk) and relocation acquires
+		 * fs_info->cleaner_mutex (btrfs_relocate_block_group) after
+		 * acquiring fs_info->bg_delete_sem. So we can't hold, nor need
+		 * to, fs_info->cleaner_mutex when deleting unused block groups.
 		 */
 		btrfs_delete_unused_bgs(root->fs_info);
 sleep:
@@ -2603,7 +2602,6 @@  int open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->unused_bgs_lock);
 	rwlock_init(&fs_info->tree_mod_log_lock);
 	mutex_init(&fs_info->unused_bg_unpin_mutex);
-	mutex_init(&fs_info->delete_unused_bgs_mutex);
 	mutex_init(&fs_info->reloc_mutex);
 	mutex_init(&fs_info->delalloc_root_mutex);
 	mutex_init(&fs_info->cleaner_delayed_iput_mutex);
@@ -2691,6 +2689,7 @@  int open_ctree(struct super_block *sb,
 	init_rwsem(&fs_info->commit_root_sem);
 	init_rwsem(&fs_info->cleanup_work_sem);
 	init_rwsem(&fs_info->subvol_sem);
+	init_rwsem(&fs_info->bg_delete_sem);
 	sema_init(&fs_info->uuid_tree_rescan_sem, 1);
 
 	btrfs_init_dev_replace_locks(fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 133c93b..1f8e996 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4128,6 +4128,7 @@  int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
 	int ret = 0;
 	int need_commit = 2;
 	int have_pinned_space;
+	int have_bg_delete_sem = 0;
 
 	/* make sure bytes are sectorsize aligned */
 	bytes = ALIGN(bytes, root->sectorsize);
@@ -4138,8 +4139,11 @@  int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
 	}
 
 	data_sinfo = fs_info->data_sinfo;
-	if (!data_sinfo)
+	if (!data_sinfo) {
+		down_read(&root->fs_info->bg_delete_sem);
+		have_bg_delete_sem = 1;
 		goto alloc;
+	}
 
 again:
 	/* make sure we have enough space to handle the data first */
@@ -4152,6 +4156,17 @@  again:
 		struct btrfs_trans_handle *trans;
 
 		/*
+		 * We may need to allocate new chunk, so we should block
+		 * btrfs_delete_unused_bgs()
+		 */
+		if (!have_bg_delete_sem) {
+			spin_unlock(&data_sinfo->lock);
+			down_read(&root->fs_info->bg_delete_sem);
+			have_bg_delete_sem = 1;
+			goto again;
+		}
+
+		/*
 		 * if we don't have enough free bytes in this space then we need
 		 * to alloc a new chunk.
 		 */
@@ -4173,17 +4188,20 @@  alloc:
 			 * the fs.
 			 */
 			trans = btrfs_join_transaction(root);
-			if (IS_ERR(trans))
+			if (IS_ERR(trans)) {
+				up_read(&root->fs_info->bg_delete_sem);
 				return PTR_ERR(trans);
+			}
 
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
 					     alloc_target,
 					     CHUNK_ALLOC_NO_FORCE);
 			btrfs_end_transaction(trans, root);
 			if (ret < 0) {
-				if (ret != -ENOSPC)
+				if (ret != -ENOSPC) {
+					up_read(&root->fs_info->bg_delete_sem);
 					return ret;
-				else {
+				} else {
 					have_pinned_space = 1;
 					goto commit_trans;
 				}
@@ -4217,15 +4235,19 @@  commit_trans:
 			}
 
 			trans = btrfs_join_transaction(root);
-			if (IS_ERR(trans))
+			if (IS_ERR(trans)) {
+				up_read(&root->fs_info->bg_delete_sem);
 				return PTR_ERR(trans);
+			}
 			if (have_pinned_space >= 0 ||
 			    test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
 				     &trans->transaction->flags) ||
 			    need_commit > 0) {
 				ret = btrfs_commit_transaction(trans, root);
-				if (ret)
+				if (ret) {
+					up_read(&root->fs_info->bg_delete_sem);
 					return ret;
+				}
 				/*
 				 * The cleaner kthread might still be doing iput
 				 * operations. Wait for it to finish so that
@@ -4242,6 +4264,7 @@  commit_trans:
 		trace_btrfs_space_reservation(root->fs_info,
 					      "space_info:enospc",
 					      data_sinfo->flags, bytes, 1);
+		up_read(&root->fs_info->bg_delete_sem);
 		return -ENOSPC;
 	}
 	data_sinfo->bytes_may_use += bytes;
@@ -4249,6 +4272,9 @@  commit_trans:
 				      data_sinfo->flags, bytes, 1);
 	spin_unlock(&data_sinfo->lock);
 
+	if (have_bg_delete_sem)
+		up_read(&root->fs_info->bg_delete_sem);
+
 	return ret;
 }
 
@@ -10763,6 +10789,65 @@  btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
 							   num_items, 1);
 }
 
+static struct btrfs_block_rsv *
+create_block_rsv_remove_block_group(struct btrfs_root *root,
+				    const u64 chunk_offset)
+{
+	struct extent_map_tree *em_tree = &root->fs_info->mapping_tree.map_tree;
+	struct extent_map *em;
+	struct map_lookup *map;
+	unsigned int num_items;
+	struct btrfs_block_rsv *rsv;
+	int ret;
+	u64 num_bytes;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+	read_unlock(&em_tree->lock);
+	ASSERT(em && em->start == chunk_offset);
+
+	/*
+	 * We need to reserve 3 + N units from the metadata space info in order
+	 * to remove a block group (done at btrfs_remove_chunk() and at
+	 * btrfs_remove_block_group()), which are used for:
+	 *
+	 * 1 unit for adding the free space inode's orphan (located in the tree
+	 * of tree roots).
+	 * 1 unit for deleting the block group item (located in the extent
+	 * tree).
+	 * 1 unit for deleting the free space item (located in tree of tree
+	 * roots).
+	 * N units for deleting N device extent items corresponding to each
+	 * stripe (located in the device tree).
+	 *
+	 * In order to remove a block group we also need to reserve units in the
+	 * system space info in order to update the chunk tree (update one or
+	 * more device items and remove one chunk item), but this is done at
+	 * btrfs_remove_chunk() through a call to check_system_chunk().
+	 */
+	map = em->map_lookup;
+	num_items = 3 + map->num_stripes;
+	free_extent_map(em);
+
+	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+	if (!rsv)
+		return ERR_PTR(-ENOMEM);
+
+	num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
+				  BTRFS_RESERVE_FLUSH_ALL);
+	if (!ret)
+		return rsv;
+
+	ret = btrfs_cond_migrate_bytes(root->fs_info, rsv, num_bytes, 1);
+	if (ret) {
+		btrfs_free_block_rsv(root, rsv);
+		return ERR_PTR(ret);
+	}
+
+	return rsv;
+}
+
 /*
  * Process the unused_bgs list and remove any that don't have any allocated
  * space inside of them.
@@ -10773,6 +10858,7 @@  void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 	struct btrfs_space_info *space_info;
 	struct btrfs_root *root = fs_info->extent_root;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_block_rsv *rsv;
 	int ret = 0;
 
 	if (!fs_info->open)
@@ -10796,7 +10882,24 @@  void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		}
 		spin_unlock(&fs_info->unused_bgs_lock);
 
-		mutex_lock(&fs_info->delete_unused_bgs_mutex);
+		/*
+		 * there is a lock order between "wait transaction finished" and
+		 * down_write(bg_delete_sem), so here we first create a
+		 * transaction handle with zero bytes reserved. Because the work
+		 * to calculate metadata bytes need to be protected by
+		 * bg_delete_sem, see btrfs_start_trans_remove_block_group()
+		 * about how to calculate metadata bytes, so we do the metadata
+		 * reservation later.
+		 */
+		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			btrfs_put_block_group(block_group);
+			spin_lock(&fs_info->unused_bgs_lock);
+			continue;
+		}
+
+		down_write(&root->fs_info->bg_delete_sem);
 
 		/* Don't want to race with allocators so take the groups_sem */
 		down_write(&space_info->groups_sem);
@@ -10813,7 +10916,7 @@  void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 			 */
 			spin_unlock(&block_group->lock);
 			up_write(&space_info->groups_sem);
-			goto next;
+			goto end_trans;
 		}
 		spin_unlock(&block_group->lock);
 
@@ -10822,21 +10925,27 @@  void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		up_write(&space_info->groups_sem);
 		if (ret < 0) {
 			ret = 0;
-			goto next;
+			goto end_trans;
 		}
 
-		/*
-		 * Want to do this before we do anything else so we can recover
-		 * properly if we fail to join the transaction.
-		 */
-		trans = btrfs_start_trans_remove_block_group(fs_info,
-						     block_group->key.objectid);
-		if (IS_ERR(trans)) {
+		rsv = create_block_rsv_remove_block_group(root,
+					block_group->key.objectid);
+		if (IS_ERR(rsv)) {
 			btrfs_dec_block_group_ro(root, block_group);
 			ret = PTR_ERR(trans);
-			goto next;
+			goto end_trans;
 		}
 
+		ret =  btrfs_block_rsv_migrate(rsv, &fs_info->trans_block_rsv,
+					       rsv->size, 1);
+		if (ret) {
+			btrfs_free_block_rsv(root, rsv);
+			btrfs_dec_block_group_ro(root, block_group);
+			goto end_trans;
+		}
+		trans->block_rsv = &fs_info->trans_block_rsv;
+		trans->bytes_reserved = rsv->size;
+		btrfs_free_block_rsv(root, rsv);
 		/*
 		 * We could have pending pinned extents for this block group,
 		 * just delete them, we don't care about them anymore.
@@ -10923,8 +11032,7 @@  void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		}
 end_trans:
 		btrfs_end_transaction(trans, root);
-next:
-		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+		up_write(&root->fs_info->bg_delete_sem);
 		btrfs_put_block_group(block_group);
 		spin_lock(&fs_info->unused_bgs_lock);
 	}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e217035..319cda0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2915,7 +2915,7 @@  static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
 	 * we release the path used to search the chunk/dev tree and before
 	 * the current task acquires this mutex and calls us.
 	 */
-	ASSERT(mutex_is_locked(&root->fs_info->delete_unused_bgs_mutex));
+	ASSERT(rwsem_is_locked(&root->fs_info->bg_delete_sem));
 
 	ret = btrfs_can_relocate(extent_root, chunk_offset);
 	if (ret)
@@ -2968,10 +2968,10 @@  again:
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 
 	while (1) {
-		mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+		down_read(&root->fs_info->bg_delete_sem);
 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
 		if (ret < 0) {
-			mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+			up_read(&root->fs_info->bg_delete_sem);
 			goto error;
 		}
 		BUG_ON(ret == 0); /* Corruption */
@@ -2979,7 +2979,7 @@  again:
 		ret = btrfs_previous_item(chunk_root, path, key.objectid,
 					  key.type);
 		if (ret)
-			mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+			up_read(&root->fs_info->bg_delete_sem);
 		if (ret < 0)
 			goto error;
 		if (ret > 0)
@@ -3001,7 +3001,7 @@  again:
 			else
 				BUG_ON(ret);
 		}
-		mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+		up_read(&root->fs_info->bg_delete_sem);
 
 		if (found_key.offset == 0)
 			break;
@@ -3557,10 +3557,10 @@  again:
 			goto error;
 		}
 
-		mutex_lock(&fs_info->delete_unused_bgs_mutex);
+		down_read(&fs_info->bg_delete_sem);
 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
 		if (ret < 0) {
-			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+			up_read(&fs_info->bg_delete_sem);
 			goto error;
 		}
 
@@ -3574,7 +3574,7 @@  again:
 		ret = btrfs_previous_item(chunk_root, path, 0,
 					  BTRFS_CHUNK_ITEM_KEY);
 		if (ret) {
-			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+			up_read(&fs_info->bg_delete_sem);
 			ret = 0;
 			break;
 		}
@@ -3584,7 +3584,7 @@  again:
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
 		if (found_key.objectid != key.objectid) {
-			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+			up_read(&fs_info->bg_delete_sem);
 			break;
 		}
 
@@ -3602,12 +3602,12 @@  again:
 
 		btrfs_release_path(path);
 		if (!ret) {
-			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+			up_read(&fs_info->bg_delete_sem);
 			goto loop;
 		}
 
 		if (counting) {
-			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+			up_read(&fs_info->bg_delete_sem);
 			spin_lock(&fs_info->balance_lock);
 			bctl->stat.expected++;
 			spin_unlock(&fs_info->balance_lock);
@@ -3632,7 +3632,7 @@  again:
 					count_meta < bctl->meta.limit_min)
 				|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
 					count_sys < bctl->sys.limit_min)) {
-			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+			up_read(&fs_info->bg_delete_sem);
 			goto loop;
 		}
 
@@ -3645,7 +3645,7 @@  again:
 		    !chunk_reserved && !bytes_used) {
 			trans = btrfs_start_transaction(chunk_root, 0);
 			if (IS_ERR(trans)) {
-				mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+				up_read(&fs_info->bg_delete_sem);
 				ret = PTR_ERR(trans);
 				goto error;
 			}
@@ -3654,7 +3654,7 @@  again:
 						      BTRFS_BLOCK_GROUP_DATA);
 			btrfs_end_transaction(trans, chunk_root);
 			if (ret < 0) {
-				mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+				up_read(&fs_info->bg_delete_sem);
 				goto error;
 			}
 			chunk_reserved = 1;
@@ -3662,7 +3662,7 @@  again:
 
 		ret = btrfs_relocate_chunk(chunk_root,
 					   found_key.offset);
-		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+		up_read(&fs_info->bg_delete_sem);
 		if (ret && ret != -ENOSPC)
 			goto error;
 		if (ret == -ENOSPC) {
@@ -4389,16 +4389,16 @@  again:
 	key.type = BTRFS_DEV_EXTENT_KEY;
 
 	do {
-		mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+		down_read(&root->fs_info->bg_delete_sem);
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0) {
-			mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+			up_read(&root->fs_info->bg_delete_sem);
 			goto done;
 		}
 
 		ret = btrfs_previous_item(root, path, 0, key.type);
 		if (ret)
-			mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+			up_read(&root->fs_info->bg_delete_sem);
 		if (ret < 0)
 			goto done;
 		if (ret) {
@@ -4412,7 +4412,7 @@  again:
 		btrfs_item_key_to_cpu(l, &key, path->slots[0]);
 
 		if (key.objectid != device->devid) {
-			mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+			up_read(&root->fs_info->bg_delete_sem);
 			btrfs_release_path(path);
 			break;
 		}
@@ -4421,7 +4421,7 @@  again:
 		length = btrfs_dev_extent_length(l, dev_extent);
 
 		if (key.offset + length <= new_size) {
-			mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+			up_read(&root->fs_info->bg_delete_sem);
 			btrfs_release_path(path);
 			break;
 		}
@@ -4430,7 +4430,7 @@  again:
 		btrfs_release_path(path);
 
 		ret = btrfs_relocate_chunk(root, chunk_offset);
-		mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+		up_read(&root->fs_info->bg_delete_sem);
 		if (ret && ret != -ENOSPC)
 			goto done;
 		if (ret == -ENOSPC)