@@ -788,6 +788,7 @@ struct btrfs_fs_info {
struct mutex cleaner_mutex;
struct mutex chunk_mutex;
struct mutex volume_mutex;
+ struct rw_semaphore bg_delete_sem;
/*
* this is taken to make sure we don't set block groups ro after
@@ -1068,7 +1069,6 @@ struct btrfs_fs_info {
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
struct mutex unused_bg_unpin_mutex;
- struct mutex delete_unused_bgs_mutex;
/* For btrfs to record security options */
struct security_mnt_opts security_opts;
@@ -1837,12 +1837,11 @@ static int cleaner_kthread(void *arg)
btrfs_run_defrag_inodes(root->fs_info);
/*
- * Acquires fs_info->delete_unused_bgs_mutex to avoid racing
- * with relocation (btrfs_relocate_chunk) and relocation
- * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
- * after acquiring fs_info->delete_unused_bgs_mutex. So we
- * can't hold, nor need to, fs_info->cleaner_mutex when deleting
- * unused block groups.
+ * Acquires fs_info->bg_delete_sem to avoid racing with
+ * relocation (btrfs_relocate_chunk) and relocation acquires
+ * fs_info->cleaner_mutex (btrfs_relocate_block_group) after
+ * acquiring fs_info->bg_delete_sem. So we can't hold, nor need
+ * to, fs_info->cleaner_mutex when deleting unused block groups.
*/
btrfs_delete_unused_bgs(root->fs_info);
sleep:
@@ -2603,7 +2602,6 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->unused_bgs_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
- mutex_init(&fs_info->delete_unused_bgs_mutex);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
mutex_init(&fs_info->cleaner_delayed_iput_mutex);
@@ -2691,6 +2689,7 @@ int open_ctree(struct super_block *sb,
init_rwsem(&fs_info->commit_root_sem);
init_rwsem(&fs_info->cleanup_work_sem);
init_rwsem(&fs_info->subvol_sem);
+ init_rwsem(&fs_info->bg_delete_sem);
sema_init(&fs_info->uuid_tree_rescan_sem, 1);
btrfs_init_dev_replace_locks(fs_info);
@@ -4128,6 +4128,7 @@ int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
int ret = 0;
int need_commit = 2;
int have_pinned_space;
+ int have_bg_delete_sem = 0;
/* make sure bytes are sectorsize aligned */
bytes = ALIGN(bytes, root->sectorsize);
@@ -4138,8 +4139,11 @@ int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
}
data_sinfo = fs_info->data_sinfo;
- if (!data_sinfo)
+ if (!data_sinfo) {
+ down_read(&root->fs_info->bg_delete_sem);
+ have_bg_delete_sem = 1;
goto alloc;
+ }
again:
/* make sure we have enough space to handle the data first */
@@ -4152,6 +4156,17 @@ again:
struct btrfs_trans_handle *trans;
/*
+ * We may need to allocate new chunk, so we should block
+ * btrfs_delete_unused_bgs()
+ */
+ if (!have_bg_delete_sem) {
+ spin_unlock(&data_sinfo->lock);
+ down_read(&root->fs_info->bg_delete_sem);
+ have_bg_delete_sem = 1;
+ goto again;
+ }
+
+ /*
* if we don't have enough free bytes in this space then we need
* to alloc a new chunk.
*/
@@ -4173,17 +4188,20 @@ alloc:
* the fs.
*/
trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
+ if (IS_ERR(trans)) {
+ up_read(&root->fs_info->bg_delete_sem);
return PTR_ERR(trans);
+ }
ret = do_chunk_alloc(trans, root->fs_info->extent_root,
alloc_target,
CHUNK_ALLOC_NO_FORCE);
btrfs_end_transaction(trans, root);
if (ret < 0) {
- if (ret != -ENOSPC)
+ if (ret != -ENOSPC) {
+ up_read(&root->fs_info->bg_delete_sem);
return ret;
- else {
+ } else {
have_pinned_space = 1;
goto commit_trans;
}
@@ -4217,15 +4235,19 @@ commit_trans:
}
trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
+ if (IS_ERR(trans)) {
+ up_read(&root->fs_info->bg_delete_sem);
return PTR_ERR(trans);
+ }
if (have_pinned_space >= 0 ||
test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
&trans->transaction->flags) ||
need_commit > 0) {
ret = btrfs_commit_transaction(trans, root);
- if (ret)
+ if (ret) {
+ up_read(&root->fs_info->bg_delete_sem);
return ret;
+ }
/*
* The cleaner kthread might still be doing iput
* operations. Wait for it to finish so that
@@ -4242,6 +4264,7 @@ commit_trans:
trace_btrfs_space_reservation(root->fs_info,
"space_info:enospc",
data_sinfo->flags, bytes, 1);
+ up_read(&root->fs_info->bg_delete_sem);
return -ENOSPC;
}
data_sinfo->bytes_may_use += bytes;
@@ -4249,6 +4272,9 @@ commit_trans:
data_sinfo->flags, bytes, 1);
spin_unlock(&data_sinfo->lock);
+ if (have_bg_delete_sem)
+ up_read(&root->fs_info->bg_delete_sem);
+
return ret;
}
@@ -10763,6 +10789,65 @@ btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
num_items, 1);
}
+static struct btrfs_block_rsv *
+create_block_rsv_remove_block_group(struct btrfs_root *root,
+ const u64 chunk_offset)
+{
+ struct extent_map_tree *em_tree = &root->fs_info->mapping_tree.map_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ unsigned int num_items;
+ struct btrfs_block_rsv *rsv;
+ int ret;
+ u64 num_bytes;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+ read_unlock(&em_tree->lock);
+ ASSERT(em && em->start == chunk_offset);
+
+ /*
+ * We need to reserve 3 + N units from the metadata space info in order
+ * to remove a block group (done at btrfs_remove_chunk() and at
+ * btrfs_remove_block_group()), which are used for:
+ *
+ * 1 unit for adding the free space inode's orphan (located in the tree
+ * of tree roots).
+ * 1 unit for deleting the block group item (located in the extent
+ * tree).
+ * 1 unit for deleting the free space item (located in tree of tree
+ * roots).
+ * N units for deleting N device extent items corresponding to each
+ * stripe (located in the device tree).
+ *
+ * In order to remove a block group we also need to reserve units in the
+ * system space info in order to update the chunk tree (update one or
+ * more device items and remove one chunk item), but this is done at
+ * btrfs_remove_chunk() through a call to check_system_chunk().
+ */
+ map = em->map_lookup;
+ num_items = 3 + map->num_stripes;
+ free_extent_map(em);
+
+ rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+ if (!rsv)
+ return ERR_PTR(-ENOMEM);
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+ ret = btrfs_block_rsv_add(root, rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_ALL);
+ if (!ret)
+ return rsv;
+
+ ret = btrfs_cond_migrate_bytes(root->fs_info, rsv, num_bytes, 1);
+ if (ret) {
+ btrfs_free_block_rsv(root, rsv);
+ return ERR_PTR(ret);
+ }
+
+ return rsv;
+}
+
/*
* Process the unused_bgs list and remove any that don't have any allocated
* space inside of them.
@@ -10773,6 +10858,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
struct btrfs_space_info *space_info;
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_trans_handle *trans;
+ struct btrfs_block_rsv *rsv;
int ret = 0;
if (!fs_info->open)
@@ -10796,7 +10882,24 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->unused_bgs_lock);
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
+ /*
+ * there is a lock order between "wait transaction finished" and
+ * down_write(bg_delete_sem), so here we first create a
+ * transaction handle with zero bytes reserved. Because the work
+ * to calculate metadata bytes need to be protected by
+ * bg_delete_sem, see btrfs_start_trans_remove_block_group()
+ * about how to calculate metadata bytes, so we do the metadata
+ * reservation later.
+ */
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_put_block_group(block_group);
+ spin_lock(&fs_info->unused_bgs_lock);
+ continue;
+ }
+
+ down_write(&root->fs_info->bg_delete_sem);
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
@@ -10813,7 +10916,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
*/
spin_unlock(&block_group->lock);
up_write(&space_info->groups_sem);
- goto next;
+ goto end_trans;
}
spin_unlock(&block_group->lock);
@@ -10822,21 +10925,27 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
up_write(&space_info->groups_sem);
if (ret < 0) {
ret = 0;
- goto next;
+ goto end_trans;
}
- /*
- * Want to do this before we do anything else so we can recover
- * properly if we fail to join the transaction.
- */
- trans = btrfs_start_trans_remove_block_group(fs_info,
- block_group->key.objectid);
- if (IS_ERR(trans)) {
+	rsv = create_block_rsv_remove_block_group(root,
+					block_group->key.objectid);
+	if (IS_ERR(rsv)) {
 		btrfs_dec_block_group_ro(root, block_group);
-		ret = PTR_ERR(trans);
-		goto next;
+		ret = PTR_ERR(rsv);
+		goto end_trans;
 	}
+ ret = btrfs_block_rsv_migrate(rsv, &fs_info->trans_block_rsv,
+ rsv->size, 1);
+ if (ret) {
+ btrfs_free_block_rsv(root, rsv);
+ btrfs_dec_block_group_ro(root, block_group);
+ goto end_trans;
+ }
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ trans->bytes_reserved = rsv->size;
+ btrfs_free_block_rsv(root, rsv);
/*
* We could have pending pinned extents for this block group,
* just delete them, we don't care about them anymore.
@@ -10923,8 +11032,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
}
end_trans:
btrfs_end_transaction(trans, root);
-next:
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ up_write(&root->fs_info->bg_delete_sem);
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
@@ -2915,7 +2915,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
* we release the path used to search the chunk/dev tree and before
* the current task acquires this mutex and calls us.
*/
- ASSERT(mutex_is_locked(&root->fs_info->delete_unused_bgs_mutex));
+ ASSERT(rwsem_is_locked(&root->fs_info->bg_delete_sem));
ret = btrfs_can_relocate(extent_root, chunk_offset);
if (ret)
@@ -2968,10 +2968,10 @@ again:
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
- mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+ down_read(&root->fs_info->bg_delete_sem);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0) {
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ up_read(&root->fs_info->bg_delete_sem);
goto error;
}
BUG_ON(ret == 0); /* Corruption */
@@ -2979,7 +2979,7 @@ again:
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
if (ret)
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ up_read(&root->fs_info->bg_delete_sem);
if (ret < 0)
goto error;
if (ret > 0)
@@ -3001,7 +3001,7 @@ again:
else
BUG_ON(ret);
}
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ up_read(&root->fs_info->bg_delete_sem);
if (found_key.offset == 0)
break;
@@ -3557,10 +3557,10 @@ again:
goto error;
}
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
+ down_read(&fs_info->bg_delete_sem);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ up_read(&fs_info->bg_delete_sem);
goto error;
}
@@ -3574,7 +3574,7 @@ again:
ret = btrfs_previous_item(chunk_root, path, 0,
BTRFS_CHUNK_ITEM_KEY);
if (ret) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ up_read(&fs_info->bg_delete_sem);
ret = 0;
break;
}
@@ -3584,7 +3584,7 @@ again:
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != key.objectid) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ up_read(&fs_info->bg_delete_sem);
break;
}
@@ -3602,12 +3602,12 @@ again:
btrfs_release_path(path);
if (!ret) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ up_read(&fs_info->bg_delete_sem);
goto loop;
}
if (counting) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ up_read(&fs_info->bg_delete_sem);
spin_lock(&fs_info->balance_lock);
bctl->stat.expected++;
spin_unlock(&fs_info->balance_lock);
@@ -3632,7 +3632,7 @@ again:
count_meta < bctl->meta.limit_min)
|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
count_sys < bctl->sys.limit_min)) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ up_read(&fs_info->bg_delete_sem);
goto loop;
}
@@ -3645,7 +3645,7 @@ again:
!chunk_reserved && !bytes_used) {
trans = btrfs_start_transaction(chunk_root, 0);
if (IS_ERR(trans)) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ up_read(&fs_info->bg_delete_sem);
ret = PTR_ERR(trans);
goto error;
}
@@ -3654,7 +3654,7 @@ again:
BTRFS_BLOCK_GROUP_DATA);
btrfs_end_transaction(trans, chunk_root);
if (ret < 0) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ up_read(&fs_info->bg_delete_sem);
goto error;
}
chunk_reserved = 1;
@@ -3662,7 +3662,7 @@ again:
ret = btrfs_relocate_chunk(chunk_root,
found_key.offset);
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ up_read(&fs_info->bg_delete_sem);
if (ret && ret != -ENOSPC)
goto error;
if (ret == -ENOSPC) {
@@ -4389,16 +4389,16 @@ again:
key.type = BTRFS_DEV_EXTENT_KEY;
do {
- mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+ down_read(&root->fs_info->bg_delete_sem);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ up_read(&root->fs_info->bg_delete_sem);
goto done;
}
ret = btrfs_previous_item(root, path, 0, key.type);
if (ret)
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ up_read(&root->fs_info->bg_delete_sem);
if (ret < 0)
goto done;
if (ret) {
@@ -4412,7 +4412,7 @@ again:
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
if (key.objectid != device->devid) {
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ up_read(&root->fs_info->bg_delete_sem);
btrfs_release_path(path);
break;
}
@@ -4421,7 +4421,7 @@ again:
length = btrfs_dev_extent_length(l, dev_extent);
if (key.offset + length <= new_size) {
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ up_read(&root->fs_info->bg_delete_sem);
btrfs_release_path(path);
break;
}
@@ -4430,7 +4430,7 @@ again:
btrfs_release_path(path);
ret = btrfs_relocate_chunk(root, chunk_offset);
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ up_read(&root->fs_info->bg_delete_sem);
if (ret && ret != -ENOSPC)
goto done;
if (ret == -ENOSPC)
cleaner_kthread() may run at any time, in which case it will call btrfs_delete_unused_bgs() to delete unused block groups. Because this work is asynchronous, it may also result in a false ENOSPC error. Please see the race window below:

        CPU1                                     | CPU2
                                                 |
|-> btrfs_alloc_data_chunk_ondemand()            |-> cleaner_kthread()
    |-> do_chunk_alloc()                         |
    |   assume it returns ENOSPC, which means    |
    |   btrfs_space_info is full and has no free |
    |   space to satisfy the data request.       |
    |                                            |-> btrfs_delete_unused_bgs()
    |                                            |   it will decrease btrfs_space_info's
    |                                            |   total_bytes, so btrfs_space_info
    |                                            |   is no longer full.
    |                                            |

In this case, we may get an ENOSPC error even though btrfs_space_info is not full.

To fix this issue, in btrfs_alloc_data_chunk_ondemand(), if we need to call do_chunk_alloc() to allocate a new chunk, we should block btrfs_delete_unused_bgs(). Here we introduce a new struct rw_semaphore, bg_delete_sem, to do this job. There is already a "struct mutex delete_unused_bgs_mutex", but since it is a mutex we cannot use it for this purpose. Of course, we could instead re-define it to be a struct rw_semaphore and then use it in btrfs_alloc_data_chunk_ondemand(). Either method would work.
But given that delete_unused_bgs_mutex's name is longer than bg_delete_sem's, I chose the first method: create a new struct rw_semaphore bg_delete_sem and delete delete_unused_bgs_mutex :)

However, after introducing this new rw_semaphore bg_delete_sem, a deadlock may occur. fstests test case btrfs/071 has revealed this deadlock; please see the call graph below:

        CPU1                                         | CPU2
|-> btrfs_commit_transaction()                       |-> cleaner_kthread()
    |-> commit_cowonly_roots()                       |    |-> btrfs_delete_unused_bgs()
        |-> btrfs_setup_space_cache()                |        |-> down_write(bg_delete_sem)
            |-> cache_save_setup()                   |        |
                |-> btrfs_check_data_free_space()    |        |
                    |-> btrfs_alloc_data_chunk_ondemand()     |-> btrfs_start_trans_remove_block_group()
                        |-> down_read(bg_delete_sem) |            |-> wait_current_trans()

Indeed there is a lock order between waiting for the current transaction to finish and down_write(bg_delete_sem), so in btrfs_delete_unused_bgs() we must obey this lock rule: we call btrfs_start_transaction(root, 0) before down_write(bg_delete_sem). In the balance code, we also down_read(bg_delete_sem) before start_transaction(), but because bg_delete_sem is a rw_semaphore, there will be no deadlock.

Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
---
V2: fix a deadlock revealed by fstests case btrfs/071; we call start_transaction() before down_write(bg_delete_sem) in btrfs_delete_unused_bgs().
---
 fs/btrfs/ctree.h       |   2 +-
 fs/btrfs/disk-io.c     |  13 ++---
 fs/btrfs/extent-tree.c | 146 ++++++++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/volumes.c     |  42 +++++++-------
 4 files changed, 155 insertions(+), 48 deletions(-)