[1/2] btrfs-progs: Avoid nested chunk allocation call
diff mbox series

Message ID 20190416102144.12173-2-wqu@suse.com
State New
Headers show
Series
  • btrfs-progs: Metadata preallocation enhancement
Related show

Commit Message

Qu Wenruo April 16, 2019, 10:21 a.m. UTC
There is a hidden call loop which can trigger itself:

btrfs_reserve_extent()             <--|
|- do_chunk_alloc()                   |
   |- btrfs_alloc_chunk()             |
      |- btrfs_insert_item()          |
	 |- btrfs_reserve_extent() <--|

Currently, we're using root->ref_cows to determine whether we should do
chunk prealloc to avoid such loop.

But that's still a hidden trap. Instead of solving it using some hidden
tricks, this patch will make chunk/block group allocation exclusive.

Now if do_chunk_alloc() determines to alloc chunk, it will set a special
flag in transaction handler so it new do_chunk_alloc() will refuse to
allocate new chunk until current chunk allocation finishes.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 check/main.c  |  2 +-
 extent-tree.c | 12 ++++++++++++
 transaction.c |  3 ++-
 transaction.h |  3 ++-
 4 files changed, 17 insertions(+), 3 deletions(-)

Comments

Filipe Manana April 16, 2019, 11:22 a.m. UTC | #1
On Tue, Apr 16, 2019 at 11:23 AM Qu Wenruo <wqu@suse.com> wrote:
>
> There is a hidden call loop which can trigger itself:
>
> btrfs_reserve_extent()             <--|
> |- do_chunk_alloc()                   |
>    |- btrfs_alloc_chunk()             |
>       |- btrfs_insert_item()          |
>          |- btrfs_reserve_extent() <--|
>
> Currently, we're using root->ref_cows to determine whether we should do
> chunk prealloc to avoid such loop.
>
> But that's still a hidden trap. Instead of solving it using some hidden
> tricks, this patch will make chunk/block group allocation exclusive.
>
> Now if do_chunk_alloc() determines to alloc chunk, it will set a special
> flag in transaction handler so it new do_chunk_alloc() will refuse to
> allocate new chunk until current chunk allocation finishes.

What if the chunk allocated during the recursion is of a different type?
I.e. we're allocating a data chunk, then when inserting the items in the
extent, chunk, device, etc trees, we run out of metadata data space and
need to allocate a metadata chunk. What you are doing skips the allocation
of the metadata chunk and makes the inserts/updates of the trees fail
with ENOSPC.

thanks



>
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  check/main.c  |  2 +-
>  extent-tree.c | 12 ++++++++++++
>  transaction.c |  3 ++-
>  transaction.h |  3 ++-
>  4 files changed, 17 insertions(+), 3 deletions(-)
>
> diff --git a/check/main.c b/check/main.c
> index 683c322eba6f..121be4b73c4f 100644
> --- a/check/main.c
> +++ b/check/main.c
> @@ -10185,7 +10185,7 @@ int cmd_check(int argc, char **argv)
>                         goto close_out;
>                 }
>
> -               trans->reinit_extent_tree = true;
> +               trans->reinit_extent_tree = 1;
>                 if (init_extent_tree) {
>                         printf("Creating a new extent tree\n");
>                         ret = reinit_extent_tree(trans, info,
> diff --git a/extent-tree.c b/extent-tree.c
> index 8c9cdeff3b02..e25ddf02e5cc 100644
> --- a/extent-tree.c
> +++ b/extent-tree.c
> @@ -1873,10 +1873,21 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
>             (flags & BTRFS_BLOCK_GROUP_SYSTEM))
>                 return 0;
>
> +       /*
> +        * We're going to allocate new chunk, during the process, we will
> +        * allocate new tree blocks, which can trigger new chunk allocation
> +        * again.
> +        * Avoid such loop call
> +        */
> +       if (trans->allocating_chunk)
> +               return 0;
> +       trans->allocating_chunk = 1;
> +
>         ret = btrfs_alloc_chunk(trans, fs_info, &start, &num_bytes,
>                                 space_info->flags);
>         if (ret == -ENOSPC) {
>                 space_info->full = 1;
> +               trans->allocating_chunk = 0;
>                 return 0;
>         }
>
> @@ -1885,6 +1896,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
>         ret = btrfs_make_block_group(trans, fs_info, 0, space_info->flags,
>                                      start, num_bytes);
>         BUG_ON(ret);
> +       trans->allocating_chunk = 0;
>         return 0;
>  }
>
> diff --git a/transaction.c b/transaction.c
> index 3a63988b0969..21377282f806 100644
> --- a/transaction.c
> +++ b/transaction.c
> @@ -46,7 +46,8 @@ struct btrfs_trans_handle* btrfs_start_transaction(struct btrfs_root *root,
>         fs_info->generation++;
>         h->transid = fs_info->generation;
>         h->blocks_reserved = num_blocks;
> -       h->reinit_extent_tree = false;
> +       h->reinit_extent_tree = 0;
> +       h->allocating_chunk = 0;
>         root->last_trans = h->transid;
>         root->commit_root = root->node;
>         extent_buffer_get(root->node);
> diff --git a/transaction.h b/transaction.h
> index 34060252dd5c..b426cd112447 100644
> --- a/transaction.h
> +++ b/transaction.h
> @@ -28,7 +28,8 @@ struct btrfs_trans_handle {
>         u64 transid;
>         u64 alloc_exclude_start;
>         u64 alloc_exclude_nr;
> -       bool reinit_extent_tree;
> +       unsigned int reinit_extent_tree:1;
> +       unsigned int allocating_chunk:1;
>         u64 delayed_ref_updates;
>         unsigned long blocks_reserved;
>         unsigned long blocks_used;
> --
> 2.21.0
>
Qu Wenruo April 16, 2019, 11:32 a.m. UTC | #2
On 2019/4/16 下午7:22, Filipe Manana wrote:
> On Tue, Apr 16, 2019 at 11:23 AM Qu Wenruo <wqu@suse.com> wrote:
>>
>> There is a hidden call loop which can trigger itself:
>>
>> btrfs_reserve_extent()             <--|
>> |- do_chunk_alloc()                   |
>>    |- btrfs_alloc_chunk()             |
>>       |- btrfs_insert_item()          |
>>          |- btrfs_reserve_extent() <--|
>>
>> Currently, we're using root->ref_cows to determine whether we should do
>> chunk prealloc to avoid such loop.
>>
>> But that's still a hidden trap. Instead of solving it using some hidden
>> tricks, this patch will make chunk/block group allocation exclusive.
>>
>> Now if do_chunk_alloc() determines to alloc chunk, it will set a special
>> flag in transaction handler so it new do_chunk_alloc() will refuse to
>> allocate new chunk until current chunk allocation finishes.
> 
> What if the chunk allocated during the recursion is of a different type?
> I.e. we're allocating a data chunk, then when inserting the items in the
> extent, chunk, device, etc trees, we run out of metadata data space and
> need to allocate a metadata chunk. What you are doing skips the allocation
> of the metadata chunk and makes the inserts/updates of the trees fail
> with ENOSPC.

That's why we do preallocation to avoid such problem.

Just as the old code shows, even when we're allocating data chunks, it
will try to check metadata space first.

And for the profile we're asking, we over-allocate 2M, definitely not
the best solution but should be enough for extent/chunk tree update.

To make it as good as kernel, we need a lot of accounting which is not
here in btrfs-progs.

So in your case, every btrfs_reserve_extent() call should either trigger
 chunk allocation (either metadata or data, or both), or have 2M overhead.

If that 2M can not meet the metadata space requirement for
chunk/extent/root tree update, then we hit the problem you described.

However 2M looks pretty generous to me, so it should just work.

Thanks,
Qu

> 
> thanks
> 
> 
> 
>>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>>  check/main.c  |  2 +-
>>  extent-tree.c | 12 ++++++++++++
>>  transaction.c |  3 ++-
>>  transaction.h |  3 ++-
>>  4 files changed, 17 insertions(+), 3 deletions(-)
>>
>> diff --git a/check/main.c b/check/main.c
>> index 683c322eba6f..121be4b73c4f 100644
>> --- a/check/main.c
>> +++ b/check/main.c
>> @@ -10185,7 +10185,7 @@ int cmd_check(int argc, char **argv)
>>                         goto close_out;
>>                 }
>>
>> -               trans->reinit_extent_tree = true;
>> +               trans->reinit_extent_tree = 1;
>>                 if (init_extent_tree) {
>>                         printf("Creating a new extent tree\n");
>>                         ret = reinit_extent_tree(trans, info,
>> diff --git a/extent-tree.c b/extent-tree.c
>> index 8c9cdeff3b02..e25ddf02e5cc 100644
>> --- a/extent-tree.c
>> +++ b/extent-tree.c
>> @@ -1873,10 +1873,21 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
>>             (flags & BTRFS_BLOCK_GROUP_SYSTEM))
>>                 return 0;
>>
>> +       /*
>> +        * We're going to allocate new chunk, during the process, we will
>> +        * allocate new tree blocks, which can trigger new chunk allocation
>> +        * again.
>> +        * Avoid such loop call
>> +        */
>> +       if (trans->allocating_chunk)
>> +               return 0;
>> +       trans->allocating_chunk = 1;
>> +
>>         ret = btrfs_alloc_chunk(trans, fs_info, &start, &num_bytes,
>>                                 space_info->flags);
>>         if (ret == -ENOSPC) {
>>                 space_info->full = 1;
>> +               trans->allocating_chunk = 0;
>>                 return 0;
>>         }
>>
>> @@ -1885,6 +1896,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
>>         ret = btrfs_make_block_group(trans, fs_info, 0, space_info->flags,
>>                                      start, num_bytes);
>>         BUG_ON(ret);
>> +       trans->allocating_chunk = 0;
>>         return 0;
>>  }
>>
>> diff --git a/transaction.c b/transaction.c
>> index 3a63988b0969..21377282f806 100644
>> --- a/transaction.c
>> +++ b/transaction.c
>> @@ -46,7 +46,8 @@ struct btrfs_trans_handle* btrfs_start_transaction(struct btrfs_root *root,
>>         fs_info->generation++;
>>         h->transid = fs_info->generation;
>>         h->blocks_reserved = num_blocks;
>> -       h->reinit_extent_tree = false;
>> +       h->reinit_extent_tree = 0;
>> +       h->allocating_chunk = 0;
>>         root->last_trans = h->transid;
>>         root->commit_root = root->node;
>>         extent_buffer_get(root->node);
>> diff --git a/transaction.h b/transaction.h
>> index 34060252dd5c..b426cd112447 100644
>> --- a/transaction.h
>> +++ b/transaction.h
>> @@ -28,7 +28,8 @@ struct btrfs_trans_handle {
>>         u64 transid;
>>         u64 alloc_exclude_start;
>>         u64 alloc_exclude_nr;
>> -       bool reinit_extent_tree;
>> +       unsigned int reinit_extent_tree:1;
>> +       unsigned int allocating_chunk:1;
>>         u64 delayed_ref_updates;
>>         unsigned long blocks_reserved;
>>         unsigned long blocks_used;
>> --
>> 2.21.0
>>
> 
>
Filipe Manana April 16, 2019, 11:40 a.m. UTC | #3
On Tue, Apr 16, 2019 at 12:32 PM Qu Wenruo <wqu@suse.de> wrote:
>
>
>
> On 2019/4/16 下午7:22, Filipe Manana wrote:
> > On Tue, Apr 16, 2019 at 11:23 AM Qu Wenruo <wqu@suse.com> wrote:
> >>
> >> There is a hidden call loop which can trigger itself:
> >>
> >> btrfs_reserve_extent()             <--|
> >> |- do_chunk_alloc()                   |
> >>    |- btrfs_alloc_chunk()             |
> >>       |- btrfs_insert_item()          |
> >>          |- btrfs_reserve_extent() <--|
> >>
> >> Currently, we're using root->ref_cows to determine whether we should do
> >> chunk prealloc to avoid such loop.
> >>
> >> But that's still a hidden trap. Instead of solving it using some hidden
> >> tricks, this patch will make chunk/block group allocation exclusive.
> >>
> >> Now if do_chunk_alloc() determines to alloc chunk, it will set a special
> >> flag in transaction handler so it new do_chunk_alloc() will refuse to
> >> allocate new chunk until current chunk allocation finishes.
> >
> > What if the chunk allocated during the recursion is of a different type?
> > I.e. we're allocating a data chunk, then when inserting the items in the
> > extent, chunk, device, etc trees, we run out of metadata data space and
> > need to allocate a metadata chunk. What you are doing skips the allocation
> > of the metadata chunk and makes the inserts/updates of the trees fail
> > with ENOSPC.
>
> That's why we do preallocation to avoid such problem.

Ok, I wasn't aware about such preallocation in btrfs-progs.

>
> Just as the old code shows, even when we're allocating data chunks, it
> will try to check metadata space first.
>
> And for the profile we're asking, we over-allocate 2M, definitely not
> the best solution but should be enough for extent/chunk tree update.
>
> To make it as good as kernel, we need a lot of accounting which is not
> here in btrfs-progs.
>
> So in your case, every btrfs_reserve_extent() call should either trigger
>  chunk allocation (either metadata or data, or both), or have 2M overhead.
>
> If that 2M can not meet the metadata space requirement for
> chunk/extent/root tree update, then we hit the problem you described.
>
> However 2M looks pretty generous to me, so it should just work.

Yes, that should be enough even for 64Kb nodes and trees with 8 (max) levels.

thanks

>
> Thanks,
> Qu
>
> >
> > thanks
> >
> >
> >
> >>
> >> Signed-off-by: Qu Wenruo <wqu@suse.com>
> >> ---
> >>  check/main.c  |  2 +-
> >>  extent-tree.c | 12 ++++++++++++
> >>  transaction.c |  3 ++-
> >>  transaction.h |  3 ++-
> >>  4 files changed, 17 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/check/main.c b/check/main.c
> >> index 683c322eba6f..121be4b73c4f 100644
> >> --- a/check/main.c
> >> +++ b/check/main.c
> >> @@ -10185,7 +10185,7 @@ int cmd_check(int argc, char **argv)
> >>                         goto close_out;
> >>                 }
> >>
> >> -               trans->reinit_extent_tree = true;
> >> +               trans->reinit_extent_tree = 1;
> >>                 if (init_extent_tree) {
> >>                         printf("Creating a new extent tree\n");
> >>                         ret = reinit_extent_tree(trans, info,
> >> diff --git a/extent-tree.c b/extent-tree.c
> >> index 8c9cdeff3b02..e25ddf02e5cc 100644
> >> --- a/extent-tree.c
> >> +++ b/extent-tree.c
> >> @@ -1873,10 +1873,21 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
> >>             (flags & BTRFS_BLOCK_GROUP_SYSTEM))
> >>                 return 0;
> >>
> >> +       /*
> >> +        * We're going to allocate new chunk, during the process, we will
> >> +        * allocate new tree blocks, which can trigger new chunk allocation
> >> +        * again.
> >> +        * Avoid such loop call
> >> +        */
> >> +       if (trans->allocating_chunk)
> >> +               return 0;
> >> +       trans->allocating_chunk = 1;
> >> +
> >>         ret = btrfs_alloc_chunk(trans, fs_info, &start, &num_bytes,
> >>                                 space_info->flags);
> >>         if (ret == -ENOSPC) {
> >>                 space_info->full = 1;
> >> +               trans->allocating_chunk = 0;
> >>                 return 0;
> >>         }
> >>
> >> @@ -1885,6 +1896,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
> >>         ret = btrfs_make_block_group(trans, fs_info, 0, space_info->flags,
> >>                                      start, num_bytes);
> >>         BUG_ON(ret);
> >> +       trans->allocating_chunk = 0;
> >>         return 0;
> >>  }
> >>
> >> diff --git a/transaction.c b/transaction.c
> >> index 3a63988b0969..21377282f806 100644
> >> --- a/transaction.c
> >> +++ b/transaction.c
> >> @@ -46,7 +46,8 @@ struct btrfs_trans_handle* btrfs_start_transaction(struct btrfs_root *root,
> >>         fs_info->generation++;
> >>         h->transid = fs_info->generation;
> >>         h->blocks_reserved = num_blocks;
> >> -       h->reinit_extent_tree = false;
> >> +       h->reinit_extent_tree = 0;
> >> +       h->allocating_chunk = 0;
> >>         root->last_trans = h->transid;
> >>         root->commit_root = root->node;
> >>         extent_buffer_get(root->node);
> >> diff --git a/transaction.h b/transaction.h
> >> index 34060252dd5c..b426cd112447 100644
> >> --- a/transaction.h
> >> +++ b/transaction.h
> >> @@ -28,7 +28,8 @@ struct btrfs_trans_handle {
> >>         u64 transid;
> >>         u64 alloc_exclude_start;
> >>         u64 alloc_exclude_nr;
> >> -       bool reinit_extent_tree;
> >> +       unsigned int reinit_extent_tree:1;
> >> +       unsigned int allocating_chunk:1;
> >>         u64 delayed_ref_updates;
> >>         unsigned long blocks_reserved;
> >>         unsigned long blocks_used;
> >> --
> >> 2.21.0
> >>
> >
> >
>

Patch
diff mbox series

diff --git a/check/main.c b/check/main.c
index 683c322eba6f..121be4b73c4f 100644
--- a/check/main.c
+++ b/check/main.c
@@ -10185,7 +10185,7 @@  int cmd_check(int argc, char **argv)
 			goto close_out;
 		}
 
-		trans->reinit_extent_tree = true;
+		trans->reinit_extent_tree = 1;
 		if (init_extent_tree) {
 			printf("Creating a new extent tree\n");
 			ret = reinit_extent_tree(trans, info,
diff --git a/extent-tree.c b/extent-tree.c
index 8c9cdeff3b02..e25ddf02e5cc 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -1873,10 +1873,21 @@  static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	    (flags & BTRFS_BLOCK_GROUP_SYSTEM))
 		return 0;
 
+	/*
+	 * We're going to allocate new chunk, during the process, we will
+	 * allocate new tree blocks, which can trigger new chunk allocation
+	 * again.
+	 * Avoid such loop call
+	 */
+	if (trans->allocating_chunk)
+		return 0;
+	trans->allocating_chunk = 1;
+
 	ret = btrfs_alloc_chunk(trans, fs_info, &start, &num_bytes,
 	                        space_info->flags);
 	if (ret == -ENOSPC) {
 		space_info->full = 1;
+		trans->allocating_chunk = 0;
 		return 0;
 	}
 
@@ -1885,6 +1896,7 @@  static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	ret = btrfs_make_block_group(trans, fs_info, 0, space_info->flags,
 				     start, num_bytes);
 	BUG_ON(ret);
+	trans->allocating_chunk = 0;
 	return 0;
 }
 
diff --git a/transaction.c b/transaction.c
index 3a63988b0969..21377282f806 100644
--- a/transaction.c
+++ b/transaction.c
@@ -46,7 +46,8 @@  struct btrfs_trans_handle* btrfs_start_transaction(struct btrfs_root *root,
 	fs_info->generation++;
 	h->transid = fs_info->generation;
 	h->blocks_reserved = num_blocks;
-	h->reinit_extent_tree = false;
+	h->reinit_extent_tree = 0;
+	h->allocating_chunk = 0;
 	root->last_trans = h->transid;
 	root->commit_root = root->node;
 	extent_buffer_get(root->node);
diff --git a/transaction.h b/transaction.h
index 34060252dd5c..b426cd112447 100644
--- a/transaction.h
+++ b/transaction.h
@@ -28,7 +28,8 @@  struct btrfs_trans_handle {
 	u64 transid;
 	u64 alloc_exclude_start;
 	u64 alloc_exclude_nr;
-	bool reinit_extent_tree;
+	unsigned int reinit_extent_tree:1;
+	unsigned int allocating_chunk:1;
 	u64 delayed_ref_updates;
 	unsigned long blocks_reserved;
 	unsigned long blocks_used;