diff mbox

[v3] btrfs: should block unused block groups deletion work when allocating data space

Message ID 20160726120420.29692-1-wangxg.fnst@cn.fujitsu.com (mailing list archive)
State Superseded
Headers show

Commit Message

Xiaoguang Wang July 26, 2016, 12:04 p.m. UTC
cleaner_kthread() may run at any time, in which it'll call btrfs_delete_unused_bgs()
to delete unused block groups. Because this work is asynchronous, it may also result
in false ENOSPC error. Please see below race window:

               CPU1                           |             CPU2
                                              |
|-> btrfs_alloc_data_chunk_ondemand()         |-> cleaner_kthread()
    |-> do_chunk_alloc()                      |   |
    |   assume it returns ENOSPC, which means |   |
    |   btrfs_space_info is full and have free|   |
    |   space to satisfy data request.        |   |
    |                                         |   |- > btrfs_delete_unused_bgs()
    |                                         |   |    it will decrease btrfs_space_info
    |                                         |   |    total_bytes and make
    |                                         |   |    btrfs_space_info is not full.
    |                                         |   |
In this case, we may get ENOSPC error, but btrfs_space_info is not full.

To fix this issue, in btrfs_alloc_data_chunk_ondemand(), if we need to call
do_chunk_alloc() to allocating new chunk, we should block btrfs_delete_unused_bgs().
So here we introduce a new struct rw_semaphore bg_delete_sem to do this job.

Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
---
v3: improve one code logic suggested by Josef, thanks.
---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/disk-io.c     |  1 +
 fs/btrfs/extent-tree.c | 44 ++++++++++++++++++++++++++++++++++++++------
 3 files changed, 40 insertions(+), 6 deletions(-)

Comments

Josef Bacik July 26, 2016, 3:51 p.m. UTC | #1
On 07/26/2016 08:04 AM, Wang Xiaoguang wrote:
> cleaner_kthread() may run at any time, in which it'll call btrfs_delete_unused_bgs()
> to delete unused block groups. Because this work is asynchronous, it may also result
> in false ENOSPC error. Please see below race window:
>
>                CPU1                           |             CPU2
>                                               |
> |-> btrfs_alloc_data_chunk_ondemand()         |-> cleaner_kthread()
>     |-> do_chunk_alloc()                      |   |
>     |   assume it returns ENOSPC, which means |   |
>     |   btrfs_space_info is full and have free|   |
>     |   space to satisfy data request.        |   |
>     |                                         |   |- > btrfs_delete_unused_bgs()
>     |                                         |   |    it will decrease btrfs_space_info
>     |                                         |   |    total_bytes and make
>     |                                         |   |    btrfs_space_info is not full.
>     |                                         |   |
> In this case, we may get ENOSPC error, but btrfs_space_info is not full.
>
> To fix this issue, in btrfs_alloc_data_chunk_ondemand(), if we need to call
> do_chunk_alloc() to allocating new chunk, we should block btrfs_delete_unused_bgs().
> So here we introduce a new struct rw_semaphore bg_delete_sem to do this job.
>
> Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
> ---
> v3: improve one code logic suggested by Josef, thanks.
> ---
>  fs/btrfs/ctree.h       |  1 +
>  fs/btrfs/disk-io.c     |  1 +
>  fs/btrfs/extent-tree.c | 44 ++++++++++++++++++++++++++++++++++++++------
>  3 files changed, 40 insertions(+), 6 deletions(-)
>
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 7eb2913..bf0751d 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -800,6 +800,7 @@ struct btrfs_fs_info {
>  	struct mutex cleaner_mutex;
>  	struct mutex chunk_mutex;
>  	struct mutex volume_mutex;
> +	struct rw_semaphore bg_delete_sem;
>
>  	/*
>  	 * this is taken to make sure we don't set block groups ro after
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 60ce119..f8609fd 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2683,6 +2683,7 @@ int open_ctree(struct super_block *sb,
>  	init_rwsem(&fs_info->commit_root_sem);
>  	init_rwsem(&fs_info->cleanup_work_sem);
>  	init_rwsem(&fs_info->subvol_sem);
> +	init_rwsem(&fs_info->bg_delete_sem);
>  	sema_init(&fs_info->uuid_tree_rescan_sem, 1);
>
>  	btrfs_init_dev_replace_locks(fs_info);
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index df8d756..50b440e 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4111,6 +4111,7 @@ int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
>  	int ret = 0;
>  	int need_commit = 2;
>  	int have_pinned_space;
> +	int have_bg_delete_sem = 0;
>
>  	/* make sure bytes are sectorsize aligned */
>  	bytes = ALIGN(bytes, root->sectorsize);
> @@ -4121,8 +4122,11 @@ int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
>  	}
>
>  	data_sinfo = fs_info->data_sinfo;
> -	if (!data_sinfo)
> +	if (!data_sinfo) {
> +		down_read(&root->fs_info->bg_delete_sem);
> +		have_bg_delete_sem = 1;
>  		goto alloc;
> +	}
>
>  again:
>  	/* make sure we have enough space to handle the data first */
> @@ -4135,6 +4139,19 @@ again:
>  		struct btrfs_trans_handle *trans;
>
>  		/*
> +		 * We may need to allocate new chunk, so we should block
> +		 * btrfs_delete_unused_bgs()
> +		 */
> +		if (!have_bg_delete_sem) {
> +			spin_unlock(&data_sinfo->lock);
> +			down_read(&root->fs_info->bg_delete_sem);
> +			have_bg_delete_sem = 1;
> +			spin_lock(&data_sinfo->lock);
> +			if (used + bytes <= data_sinfo->total_bytes)

Doh sorry I forgot to mention you need to re-calculate used here.  Also I 
noticed we already have a delete_unused_bgs_mutex, can we drop that and replace 
it with bg_delete_sem as well?  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba July 26, 2016, 4:25 p.m. UTC | #2
On Tue, Jul 26, 2016 at 11:51:39AM -0400, Josef Bacik wrote:
> Also I 
> noticed we already have a delete_unused_bgs_mutex, can we drop that and replace 
> it with bg_delete_sem as well?  Thanks,

Oh, yes please, making sense of all the mutexes and semaphores gets
harder with every new one.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiaoguang Wang July 27, 2016, 1:26 a.m. UTC | #3
hello,

On 07/26/2016 11:51 PM, Josef Bacik wrote:
> On 07/26/2016 08:04 AM, Wang Xiaoguang wrote:
>> cleaner_kthread() may run at any time, in which it'll call 
>> btrfs_delete_unused_bgs()
>> to delete unused block groups. Because this work is asynchronous, it 
>> may also result
>> in false ENOSPC error. Please see below race window:
>>
>>                CPU1                           |             CPU2
>>                                               |
>> |-> btrfs_alloc_data_chunk_ondemand()         |-> cleaner_kthread()
>>     |-> do_chunk_alloc()                      |   |
>>     |   assume it returns ENOSPC, which means |   |
>>     |   btrfs_space_info is full and have free|   |
>>     |   space to satisfy data request.        |   |
>>     |                                         |   |- > 
>> btrfs_delete_unused_bgs()
>>     |                                         |   |    it will 
>> decrease btrfs_space_info
>>     |                                         |   | total_bytes and make
>>     |                                         |   | btrfs_space_info 
>> is not full.
>>     |                                         |   |
>> In this case, we may get ENOSPC error, but btrfs_space_info is not full.
>>
>> To fix this issue, in btrfs_alloc_data_chunk_ondemand(), if we need 
>> to call
>> do_chunk_alloc() to allocating new chunk, we should block 
>> btrfs_delete_unused_bgs().
>> So here we introduce a new struct rw_semaphore bg_delete_sem to do 
>> this job.
>>
>> Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
>> ---
>> v3: improve one code logic suggested by Josef, thanks.
>> ---
>>  fs/btrfs/ctree.h       |  1 +
>>  fs/btrfs/disk-io.c     |  1 +
>>  fs/btrfs/extent-tree.c | 44 
>> ++++++++++++++++++++++++++++++++++++++------
>>  3 files changed, 40 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index 7eb2913..bf0751d 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -800,6 +800,7 @@ struct btrfs_fs_info {
>>      struct mutex cleaner_mutex;
>>      struct mutex chunk_mutex;
>>      struct mutex volume_mutex;
>> +    struct rw_semaphore bg_delete_sem;
>>
>>      /*
>>       * this is taken to make sure we don't set block groups ro after
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index 60ce119..f8609fd 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -2683,6 +2683,7 @@ int open_ctree(struct super_block *sb,
>>      init_rwsem(&fs_info->commit_root_sem);
>>      init_rwsem(&fs_info->cleanup_work_sem);
>>      init_rwsem(&fs_info->subvol_sem);
>> +    init_rwsem(&fs_info->bg_delete_sem);
>>      sema_init(&fs_info->uuid_tree_rescan_sem, 1);
>>
>>      btrfs_init_dev_replace_locks(fs_info);
>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>> index df8d756..50b440e 100644
>> --- a/fs/btrfs/extent-tree.c
>> +++ b/fs/btrfs/extent-tree.c
>> @@ -4111,6 +4111,7 @@ int btrfs_alloc_data_chunk_ondemand(struct 
>> inode *inode, u64 bytes)
>>      int ret = 0;
>>      int need_commit = 2;
>>      int have_pinned_space;
>> +    int have_bg_delete_sem = 0;
>>
>>      /* make sure bytes are sectorsize aligned */
>>      bytes = ALIGN(bytes, root->sectorsize);
>> @@ -4121,8 +4122,11 @@ int btrfs_alloc_data_chunk_ondemand(struct 
>> inode *inode, u64 bytes)
>>      }
>>
>>      data_sinfo = fs_info->data_sinfo;
>> -    if (!data_sinfo)
>> +    if (!data_sinfo) {
>> +        down_read(&root->fs_info->bg_delete_sem);
>> +        have_bg_delete_sem = 1;
>>          goto alloc;
>> +    }
>>
>>  again:
>>      /* make sure we have enough space to handle the data first */
>> @@ -4135,6 +4139,19 @@ again:
>>          struct btrfs_trans_handle *trans;
>>
>>          /*
>> +         * We may need to allocate new chunk, so we should block
>> +         * btrfs_delete_unused_bgs()
>> +         */
>> +        if (!have_bg_delete_sem) {
>> +            spin_unlock(&data_sinfo->lock);
>> +            down_read(&root->fs_info->bg_delete_sem);
>> +            have_bg_delete_sem = 1;
>> +            spin_lock(&data_sinfo->lock);
>> +            if (used + bytes <= data_sinfo->total_bytes)
>
> Doh sorry I forgot to mention you need to re-calculate used here. Also 
> I noticed we already have a delete_unused_bgs_mutex, can we drop that 
> and replace it with bg_delete_sem as well?  Thanks,
Sorry, I also forgot to re-calculate used...
OK, I'll try to replace delete_unused_bgs_mutex with bg_delete_sem, 
please wait a while :)

Regards,
Xiaoguang Wang

>
> Josef
>
>



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7eb2913..bf0751d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -800,6 +800,7 @@  struct btrfs_fs_info {
 	struct mutex cleaner_mutex;
 	struct mutex chunk_mutex;
 	struct mutex volume_mutex;
+	struct rw_semaphore bg_delete_sem;
 
 	/*
 	 * this is taken to make sure we don't set block groups ro after
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 60ce119..f8609fd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2683,6 +2683,7 @@  int open_ctree(struct super_block *sb,
 	init_rwsem(&fs_info->commit_root_sem);
 	init_rwsem(&fs_info->cleanup_work_sem);
 	init_rwsem(&fs_info->subvol_sem);
+	init_rwsem(&fs_info->bg_delete_sem);
 	sema_init(&fs_info->uuid_tree_rescan_sem, 1);
 
 	btrfs_init_dev_replace_locks(fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index df8d756..50b440e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4111,6 +4111,7 @@  int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
 	int ret = 0;
 	int need_commit = 2;
 	int have_pinned_space;
+	int have_bg_delete_sem = 0;
 
 	/* make sure bytes are sectorsize aligned */
 	bytes = ALIGN(bytes, root->sectorsize);
@@ -4121,8 +4122,11 @@  int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
 	}
 
 	data_sinfo = fs_info->data_sinfo;
-	if (!data_sinfo)
+	if (!data_sinfo) {
+		down_read(&root->fs_info->bg_delete_sem);
+		have_bg_delete_sem = 1;
 		goto alloc;
+	}
 
 again:
 	/* make sure we have enough space to handle the data first */
@@ -4135,6 +4139,19 @@  again:
 		struct btrfs_trans_handle *trans;
 
 		/*
+		 * We may need to allocate new chunk, so we should block
+		 * btrfs_delete_unused_bgs()
+		 */
+		if (!have_bg_delete_sem) {
+			spin_unlock(&data_sinfo->lock);
+			down_read(&root->fs_info->bg_delete_sem);
+			have_bg_delete_sem = 1;
+			spin_lock(&data_sinfo->lock);
+			if (used + bytes <= data_sinfo->total_bytes)
+				goto done;
+		}
+
+		/*
 		 * if we don't have enough free bytes in this space then we need
 		 * to alloc a new chunk.
 		 */
@@ -4156,17 +4173,20 @@  alloc:
 			 * the fs.
 			 */
 			trans = btrfs_join_transaction(root);
-			if (IS_ERR(trans))
+			if (IS_ERR(trans)) {
+				up_read(&root->fs_info->bg_delete_sem);
 				return PTR_ERR(trans);
+			}
 
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
 					     alloc_target,
 					     CHUNK_ALLOC_NO_FORCE);
 			btrfs_end_transaction(trans, root);
 			if (ret < 0) {
-				if (ret != -ENOSPC)
+				if (ret != -ENOSPC) {
+					up_read(&root->fs_info->bg_delete_sem);
 					return ret;
-				else {
+				} else {
 					have_pinned_space = 1;
 					goto commit_trans;
 				}
@@ -4200,15 +4220,19 @@  commit_trans:
 			}
 
 			trans = btrfs_join_transaction(root);
-			if (IS_ERR(trans))
+			if (IS_ERR(trans)) {
+				up_read(&root->fs_info->bg_delete_sem);
 				return PTR_ERR(trans);
+			}
 			if (have_pinned_space >= 0 ||
 			    test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
 				     &trans->transaction->flags) ||
 			    need_commit > 0) {
 				ret = btrfs_commit_transaction(trans, root);
-				if (ret)
+				if (ret) {
+					up_read(&root->fs_info->bg_delete_sem);
 					return ret;
+				}
 				/*
 				 * The cleaner kthread might still be doing iput
 				 * operations. Wait for it to finish so that
@@ -4225,13 +4249,19 @@  commit_trans:
 		trace_btrfs_space_reservation(root->fs_info,
 					      "space_info:enospc",
 					      data_sinfo->flags, bytes, 1);
+		up_read(&root->fs_info->bg_delete_sem);
 		return -ENOSPC;
 	}
+
+done:
 	data_sinfo->bytes_may_use += bytes;
 	trace_btrfs_space_reservation(root->fs_info, "space_info",
 				      data_sinfo->flags, bytes, 1);
 	spin_unlock(&data_sinfo->lock);
 
+	if (have_bg_delete_sem)
+		up_read(&root->fs_info->bg_delete_sem);
+
 	return ret;
 }
 
@@ -10594,6 +10624,7 @@  void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		spin_unlock(&fs_info->unused_bgs_lock);
 
 		mutex_lock(&fs_info->delete_unused_bgs_mutex);
+		down_write(&root->fs_info->bg_delete_sem);
 
 		/* Don't want to race with allocators so take the groups_sem */
 		down_write(&space_info->groups_sem);
@@ -10721,6 +10752,7 @@  void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 end_trans:
 		btrfs_end_transaction(trans, root);
 next:
+		up_write(&root->fs_info->bg_delete_sem);
 		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 		btrfs_put_block_group(block_group);
 		spin_lock(&fs_info->unused_bgs_lock);