diff mbox

Btrfs: fix negative qgroup tracking from owner accounting (bug #61951)

Message ID 1382620926-8513-1-git-send-email-list.btrfs@jan-o-sch.net (mailing list archive)
State New, archived
Headers show

Commit Message

Jan Schmidt Oct. 24, 2013, 1:22 p.m. UTC
btrfs_dec_ref() queued a delayed ref for owner of a tree block. The qgroup
tracking is based on delayed refs. The owner of a tree block is set when a
tree block is allocated, it is never updated.

When you allocate a tree block and then remove the subvolume that did the
allocation, the qgroup accounting for that removal is correct. However, the
removal was accounted again for each subvolume deletion that also referenced
the tree block, because accounting was erroneously based on the owner.

Instead of queueing delayed refs for the non-existent owner, we now
queue delayed refs for the root being removed. This fixes the qgroup
accounting.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Tested-by: <dustymabe@gmail.com>
---
 fs/btrfs/extent-tree.c |   14 +++++++++-----
 1 files changed, 9 insertions(+), 5 deletions(-)

Comments

Wang Shilong Oct. 24, 2013, 2:49 p.m. UTC | #1
Hello Jan,

> btrfs_dec_ref() queued a delayed ref for owner of a tree block. The qgroup
> tracking is based on delayed refs. The owner of a tree block is set when a
> tree block is allocated, it is never updated.
> 
> When you allocate a tree block and then remove the subvolume that did the
> allocation, the qgroup accounting for that removal is correct. However, the
> removal was accounted again for each subvolume deletion that also referenced
> the tree block, because accounting was erroneously based on the owner.
> 
> Instead of queueing delayed refs for the non-existent owner, we now
> queue delayed refs for the root being removed. This fixes the qgroup
> accounting.

Thanks for tracking this, i apply your patch, and using the flowing patch,
found the problem still exist, the test script like the following:

#!/bin/sh

for i in $(seq 1000)
do
	dd if=/dev/zero of=<mnt>/$i""aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  bs=10K count=1
done

btrfs sub snapshot <mnt> <mnt>/1
for i in $(seq 100)
do
	btrfs sub snapshot <mnt>/$i <mnt>/$(($i+1))
done

for i in $(seq 101)
do
	btrfs sub delete <mnt>/$i
done


Thanks,
Wang
> 
> Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
> Tested-by: <dustymabe@gmail.com>
> ---
> fs/btrfs/extent-tree.c |   14 +++++++++-----
> 1 files changed, 9 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index d58bef1..7846cae 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -3004,12 +3004,11 @@ out:
> static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
> 			   struct btrfs_root *root,
> 			   struct extent_buffer *buf,
> -			   int full_backref, int inc, int for_cow)
> +			   int full_backref, u64 ref_root, int inc, int for_cow)
> {
> 	u64 bytenr;
> 	u64 num_bytes;
> 	u64 parent;
> -	u64 ref_root;
> 	u32 nritems;
> 	struct btrfs_key key;
> 	struct btrfs_file_extent_item *fi;
> @@ -3019,7 +3018,6 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
> 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
> 			    u64, u64, u64, u64, u64, u64, int);
> 
> -	ref_root = btrfs_header_owner(buf);
> 	nritems = btrfs_header_nritems(buf);
> 	level = btrfs_header_level(buf);
> 
> @@ -3075,13 +3073,19 @@ fail:
> int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
> 		  struct extent_buffer *buf, int full_backref, int for_cow)
> {
> -	return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
> +	u64 ref_root;
> +
> +	ref_root = btrfs_header_owner(buf);
> +
> +	return __btrfs_mod_ref(trans, root, buf, full_backref, ref_root,
> +			       1, for_cow);
> }
> 
> int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
> 		  struct extent_buffer *buf, int full_backref, int for_cow)
> {
> -	return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
> +	return __btrfs_mod_ref(trans, root, buf, full_backref, root->objectid,
> +			       0, for_cow);
> }
> 
> static int write_one_cache_group(struct btrfs_trans_handle *trans,
> -- 
> 1.7.2.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Schmidt Oct. 24, 2013, 3:36 p.m. UTC | #2
On Thu, October 24, 2013 at 16:49 (+0200), Wang Shilong wrote:
> Hello Jan,
> 
>> btrfs_dec_ref() queued a delayed ref for owner of a tree block. The qgroup
>> tracking is based on delayed refs. The owner of a tree block is set when a
>> tree block is allocated, it is never updated.
>>
>> When you allocate a tree block and then remove the subvolume that did the
>> allocation, the qgroup accounting for that removal is correct. However, the
>> removal was accounted again for each subvolume deletion that also referenced
>> the tree block, because accounting was erroneously based on the owner.
>>
>> Instead of queueing delayed refs for the non-existent owner, we now
>> queue delayed refs for the root being removed. This fixes the qgroup
>> accounting.
> 
> Thanks for tracking this, i apply your patch, and using the flowing patch,
> found the problem still exist, the test script like the following:

Reproduced. Gives more negative numbers due to accounting triggered by the
cleaner thread, that's the common part here. I still believe that the fix I sent
is correct, it's probably not complete. Looking into it.

Thanks,
-Jan

> #!/bin/sh
> 
> for i in $(seq 1000)
> do
> 	dd if=/dev/zero of=<mnt>/$i""aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  bs=10K count=1
> done
> 
> btrfs sub snapshot <mnt> <mnt>/1
> for i in $(seq 100)
> do
> 	btrfs sub snapshot <mnt>/$i <mnt>/$(($i+1))
> done
> 
> for i in $(seq 101)
> do
> 	btrfs sub delete <mnt>/$i
> done
> 
> 
> Thanks,
> Wang
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wang Shilong Oct. 25, 2013, 4:08 a.m. UTC | #3
Hello Jan,

On 10/24/2013 11:36 PM, Jan Schmidt wrote:
> On Thu, October 24, 2013 at 16:49 (+0200), Wang Shilong wrote:
>> Hello Jan,
>>
>>> btrfs_dec_ref() queued a delayed ref for owner of a tree block. The qgroup
>>> tracking is based on delayed refs. The owner of a tree block is set when a
>>> tree block is allocated, it is never updated.
>>>
>>> When you allocate a tree block and then remove the subvolume that did the
>>> allocation, the qgroup accounting for that removal is correct. However, the
>>> removal was accounted again for each subvolume deletion that also referenced
>>> the tree block, because accounting was erroneously based on the owner.
>>>
>>> Instead of queueing delayed refs for the non-existent owner, we now
>>> queue delayed refs for the root being removed. This fixes the qgroup
>>> accounting.
>> Thanks for tracking this, i apply your patch, and using the flowing patch,
>> found the problem still exist, the test script like the following:
> Reproduced. Gives more negative numbers due to accounting triggered by the
> cleaner thread, that's the common part here. I still believe that the fix I sent
> is correct, it's probably not complete. Looking into it.
I really wait cleaner thread to finish work, and i use btrfs-debug-tree 
to confirm
all the fs tree have been deleted.

But using btrfs qgroup show, i still get negative numers, also root 
subvolume's
exclusive is wrong.. Statices are like following.

0/5 13090816 471040
0/257 13078528 0
0/259 13078528 0
0/260 13078528 0
0/261 13078528 0
.........................
........................
.......................
0/350 13078528 0
0/351 13078528 0
0/352 13078528 0
0/353 13078528 0
0/354 13078528 0
0/355 13078528 0
0/356 13078528 0
0/357 13078528 0
0/358 12619776 -155648

Thanks,
Wang
>
> Thanks,
> -Jan
>
>> #!/bin/sh
>>
>> for i in $(seq 1000)
>> do
>> 	dd if=/dev/zero of=<mnt>/$i""aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  bs=10K count=1
>> done
>>
>> btrfs sub snapshot <mnt> <mnt>/1
>> for i in $(seq 100)
>> do
>> 	btrfs sub snapshot <mnt>/$i <mnt>/$(($i+1))
>> done
>>
>> for i in $(seq 101)
>> do
>> 	btrfs sub delete <mnt>/$i
>> done
>>
>>
>> Thanks,
>> Wang
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Schmidt Nov. 1, 2013, 9:16 a.m. UTC | #4
(cc Arne)

On Thu, October 24, 2013 at 16:49 (+0200), Wang Shilong wrote:
> Hello Jan,
> 
>> btrfs_dec_ref() queued a delayed ref for owner of a tree block. The qgroup
>> tracking is based on delayed refs. The owner of a tree block is set when a
>> tree block is allocated, it is never updated.
>>
>> When you allocate a tree block and then remove the subvolume that did the
>> allocation, the qgroup accounting for that removal is correct. However, the
>> removal was accounted again for each subvolume deletion that also referenced
>> the tree block, because accounting was erroneously based on the owner.
>>
>> Instead of queueing delayed refs for the non-existent owner, we now
>> queue delayed refs for the root being removed. This fixes the qgroup
>> accounting.
> 
> Thanks for tracking this, i apply your patch, and using the flowing patch,
> found the problem still exist, the test script like the following:
> 
> #!/bin/sh
> 
> for i in $(seq 1000)
> do
> 	dd if=/dev/zero of=<mnt>/$i""aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  bs=10K count=1
> done
> 
> btrfs sub snapshot <mnt> <mnt>/1
> for i in $(seq 100)
> do
> 	btrfs sub snapshot <mnt>/$i <mnt>/$(($i+1))
> done
> 
> for i in $(seq 101)
> do
> 	btrfs sub delete <mnt>/$i
> done

I've understood the problem this reproducer creates. In fact, you can shorten it
dramatically. The story of qgroups is going to turn awkward at this point.

mkfs and enable quota, put some data in (needs a level 2 tree)
-> this accounts rfer and excl for qgroup 5

take a snapshot
-> this creates qgroup 257, which gets rfer(257) = rfer(5) and excl(257) = 0,
excl(5) = 0.

now make sure you don't cow anything (which we always did in our extensive
tests), just drop the newly created snapshot.
-> excl(5) ought to become what it was before the snapshot, and there's no code
for this. This is because there is node code that brings rfer(257) to zero, the
data extents are not touched because the tree blocks of 5 and 257 are shared.

Drop tree does not go down the whole tree, when it finds a tree block with
refcnt > 1 it just decrements it and is done. This is very efficient but is bad
the qgroup numbers.

We have got three possibile solutions in mind:

A: Always walk down the whole tree for quota-enabled fs tree drops. Can be done
with the read-ahead code, but is potentially a whole lot of work for large file
systems.

B: Use tracking qgroups as required for several operations on higher level
qgroups also for the level 0 qgroups. They could be created automatically and
track the correct numbers just in case a snapshot is deleted. The problem with
that approach is that it does not scale for a large number of subvolumes, as you
need to track each possible combination of all subvolumes (exponential costs).

C: Make sure all your metadata is cowed before dropping a subvolume. This is
explicitly doing what solution A would do implicitly, but can theoretically be
done by the user. I don't consider C a practical solution.

Sigh.
-Jan
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Schmidt Nov. 1, 2013, 9:19 a.m. UTC | #5
Hi Josef,

please consider this patch for btrfs-next and for the following merge window
(3.13). The fact that there's another problem concerning qgroups as discussed in
the rest of this thread doesn't make this patch any less correct.

Thanks,
-Jan

On Thu, October 24, 2013 at 15:22 (+0200), Jan Schmidt wrote:
> btrfs_dec_ref() queued a delayed ref for owner of a tree block. The qgroup
> tracking is based on delayed refs. The owner of a tree block is set when a
> tree block is allocated, it is never updated.
> 
> When you allocate a tree block and then remove the subvolume that did the
> allocation, the qgroup accounting for that removal is correct. However, the
> removal was accounted again for each subvolume deletion that also referenced
> the tree block, because accounting was erroneously based on the owner.
> 
> Instead of queueing delayed refs for the non-existent owner, we now
> queue delayed refs for the root being removed. This fixes the qgroup
> accounting.
> 
> Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
> Tested-by: <dustymabe@gmail.com>
> ---
>  fs/btrfs/extent-tree.c |   14 +++++++++-----
>  1 files changed, 9 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index d58bef1..7846cae 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -3004,12 +3004,11 @@ out:
>  static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
>  			   struct btrfs_root *root,
>  			   struct extent_buffer *buf,
> -			   int full_backref, int inc, int for_cow)
> +			   int full_backref, u64 ref_root, int inc, int for_cow)
>  {
>  	u64 bytenr;
>  	u64 num_bytes;
>  	u64 parent;
> -	u64 ref_root;
>  	u32 nritems;
>  	struct btrfs_key key;
>  	struct btrfs_file_extent_item *fi;
> @@ -3019,7 +3018,6 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
>  	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
>  			    u64, u64, u64, u64, u64, u64, int);
>  
> -	ref_root = btrfs_header_owner(buf);
>  	nritems = btrfs_header_nritems(buf);
>  	level = btrfs_header_level(buf);
>  
> @@ -3075,13 +3073,19 @@ fail:
>  int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>  		  struct extent_buffer *buf, int full_backref, int for_cow)
>  {
> -	return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
> +	u64 ref_root;
> +
> +	ref_root = btrfs_header_owner(buf);
> +
> +	return __btrfs_mod_ref(trans, root, buf, full_backref, ref_root,
> +			       1, for_cow);
>  }
>  
>  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>  		  struct extent_buffer *buf, int full_backref, int for_cow)
>  {
> -	return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
> +	return __btrfs_mod_ref(trans, root, buf, full_backref, root->objectid,
> +			       0, for_cow);
>  }
>  
>  static int write_one_cache_group(struct btrfs_trans_handle *trans,
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josef Bacik Nov. 1, 2013, 12:42 p.m. UTC | #6
On Fri, Nov 01, 2013 at 10:16:37AM +0100, Jan Schmidt wrote:
> (cc Arne)
> 
> On Thu, October 24, 2013 at 16:49 (+0200), Wang Shilong wrote:
> > Hello Jan,
> > 
> >> btrfs_dec_ref() queued a delayed ref for owner of a tree block. The qgroup
> >> tracking is based on delayed refs. The owner of a tree block is set when a
> >> tree block is allocated, it is never updated.
> >>
> >> When you allocate a tree block and then remove the subvolume that did the
> >> allocation, the qgroup accounting for that removal is correct. However, the
> >> removal was accounted again for each subvolume deletion that also referenced
> >> the tree block, because accounting was erroneously based on the owner.
> >>
> >> Instead of queueing delayed refs for the non-existent owner, we now
> >> queue delayed refs for the root being removed. This fixes the qgroup
> >> accounting.
> > 
> > Thanks for tracking this, i apply your patch, and using the flowing patch,
> > found the problem still exist, the test script like the following:
> > 
> > #!/bin/sh
> > 
> > for i in $(seq 1000)
> > do
> > 	dd if=/dev/zero of=<mnt>/$i""aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa  bs=10K count=1
> > done
> > 
> > btrfs sub snapshot <mnt> <mnt>/1
> > for i in $(seq 100)
> > do
> > 	btrfs sub snapshot <mnt>/$i <mnt>/$(($i+1))
> > done
> > 
> > for i in $(seq 101)
> > do
> > 	btrfs sub delete <mnt>/$i
> > done
> 
> I've understood the problem this reproducer creates. In fact, you can shorten it
> dramatically. The story of qgroups is going to turn awkward at this point.
> 
> mkfs and enable quota, put some data in (needs a level 2 tree)
> -> this accounts rfer and excl for qgroup 5
> 
> take a snapshot
> -> this creates qgroup 257, which gets rfer(257) = rfer(5) and excl(257) = 0,
> excl(5) = 0.
> 
> now make sure you don't cow anything (which we always did in our extensive
> tests), just drop the newly created snapshot.
> -> excl(5) ought to become what it was before the snapshot, and there's no code
> for this. This is because there is node code that brings rfer(257) to zero, the
> data extents are not touched because the tree blocks of 5 and 257 are shared.
> 
> Drop tree does not go down the whole tree, when it finds a tree block with
> refcnt > 1 it just decrements it and is done. This is very efficient but is bad
> the qgroup numbers.
> 
> We have got three possibile solutions in mind:
> 
> A: Always walk down the whole tree for quota-enabled fs tree drops. Can be done
> with the read-ahead code, but is potentially a whole lot of work for large file
> systems.
> 

No.

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josef Bacik Nov. 1, 2013, 3:07 p.m. UTC | #7
On Thu, Oct 24, 2013 at 03:22:06PM +0200, Jan Schmidt wrote:
> btrfs_dec_ref() queued a delayed ref for owner of a tree block. The qgroup
> tracking is based on delayed refs. The owner of a tree block is set when a
> tree block is allocated, it is never updated.
> 
> When you allocate a tree block and then remove the subvolume that did the
> allocation, the qgroup accounting for that removal is correct. However, the
> removal was accounted again for each subvolume deletion that also referenced
> the tree block, because accounting was erroneously based on the owner.
> 
> Instead of queueing delayed refs for the non-existent owner, we now
> queue delayed refs for the root being removed. This fixes the qgroup
> accounting.
> 
> Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
> Tested-by: <dustymabe@gmail.com>

I wasn't able to reproduce this as it was written in the bugzilla, I assume you
got a consistent reproducer so please push it to xfstests.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wang Shilong Nov. 2, 2013, 4:35 a.m. UTC | #8
Hello Jan, Arne

On 11/01/2013 05:16 PM, Jan Schmidt wrote:
> I've understood the problem this reproducer creates. In fact, you can shorten it
> dramatically. The story of qgroups is going to turn awkward at this point.
>
> mkfs and enable quota, put some data in (needs a level 2 tree)
> -> this accounts rfer and excl for qgroup 5
>
> take a snapshot
> -> this creates qgroup 257, which gets rfer(257) = rfer(5) and excl(257) = 0,
> excl(5) = 0.
>
> now make sure you don't cow anything (which we always did in our extensive
> tests), just drop the newly created snapshot.
> -> excl(5) ought to become what it was before the snapshot, and there's no code
> for this. This is because there is node code that brings rfer(257) to zero, the
> data extents are not touched because the tree blocks of 5 and 257 are shared.
>
> Drop tree does not go down the whole tree, when it finds a tree block with
> refcnt > 1 it just decrements it and is done. This is very efficient but is bad
> the qgroup numbers.
>
> We have got three possibile solutions in mind:
>
> A: Always walk down the whole tree for quota-enabled fs tree drops. Can be done
> with the read-ahead code, but is potentially a whole lot of work for large file
> systems.
>
> B: Use tracking qgroups as required for several operations on higher level
> qgroups also for the level 0 qgroups. They could be created automatically and
> track the correct numbers just in case a snapshot is deleted. The problem with
> that approach is that it does not scale for a large number of subvolumes, as you
> need to track each possible combination of all subvolumes (exponential costs).
>
> C: Make sure all your metadata is cowed before dropping a subvolume. This is
> explicitly doing what solution A would do implicitly, but can theoretically be
> done by the user. I don't consider C a practical solution.
Qgroup's exclusive size is an important feature to know a subvolume's 
sole size.
However, it really brings a lot of problems.

1> To differ refer and exclusive size, we have to walk backref to find 
all root
for a backref in a point,find_all_root() can slow down btrfs if there 
are a lot of
snapshots...

2> some people complain that with qgroup enabled, system memory cost
become extremely high, this maybe related to qgroup tracking for delayed
refs.

3> Deleting a subvolume/Snapshot can make btrfs qgroup tracking wrong,
we haven't found an effective way to solve this problem.

So maybe we should remove qgroup's exclusive or add an option to disable
qgroup's exclusive size, this will make life easier, considering:

1> we don't have to walk backref, calling find_all_root() will be avoided.

2> system memory high cost maybe be avoided.

3> When deleting a subvolume, we just destroy its qgroup.

If there are no objections against it, i'd like to add it my todo list.:-P

Thanks,
Wang
> Sigh.
> -Jan
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message tomajordomo@vger.kernel.org
> More majordomo info athttp://vger.kernel.org/majordomo-info.html
>

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josef Bacik Nov. 4, 2013, 5:42 p.m. UTC | #9
On Thu, Oct 24, 2013 at 03:22:06PM +0200, Jan Schmidt wrote:
> btrfs_dec_ref() queued a delayed ref for owner of a tree block. The qgroup
> tracking is based on delayed refs. The owner of a tree block is set when a
> tree block is allocated, it is never updated.
> 
> When you allocate a tree block and then remove the subvolume that did the
> allocation, the qgroup accounting for that removal is correct. However, the
> removal was accounted again for each subvolume deletion that also referenced
> the tree block, because accounting was erroneously based on the owner.
> 
> Instead of queueing delayed refs for the non-existent owner, we now
> queue delayed refs for the root being removed. This fixes the qgroup
> accounting.
> 
> Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
> Tested-by: <dustymabe@gmail.com>

This breaks btrfs/003, I'm kicking it out.

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Schmidt Nov. 6, 2013, 5:20 p.m. UTC | #10
On Mon, November 04, 2013 at 18:42 (+0100), Josef Bacik wrote:
> On Thu, Oct 24, 2013 at 03:22:06PM +0200, Jan Schmidt wrote:
>> btrfs_dec_ref() queued a delayed ref for owner of a tree block. The qgroup
>> tracking is based on delayed refs. The owner of a tree block is set when a
>> tree block is allocated, it is never updated.
>>
>> When you allocate a tree block and then remove the subvolume that did the
>> allocation, the qgroup accounting for that removal is correct. However, the
>> removal was accounted again for each subvolume deletion that also referenced
>> the tree block, because accounting was erroneously based on the owner.
>>
>> Instead of queueing delayed refs for the non-existent owner, we now
>> queue delayed refs for the root being removed. This fixes the qgroup
>> accounting.
>>
>> Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
>> Tested-by: <dustymabe@gmail.com>
> 
> This breaks btrfs/003, I'm kicking it out.

Can you be a bit more specific? Works fine here.

-Jan
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josef Bacik Nov. 6, 2013, 5:34 p.m. UTC | #11
On Wed, Nov 06, 2013 at 06:20:47PM +0100, Jan Schmidt wrote:
>  
> On Mon, November 04, 2013 at 18:42 (+0100), Josef Bacik wrote:
> > On Thu, Oct 24, 2013 at 03:22:06PM +0200, Jan Schmidt wrote:
> >> btrfs_dec_ref() queued a delayed ref for owner of a tree block. The qgroup
> >> tracking is based on delayed refs. The owner of a tree block is set when a
> >> tree block is allocated, it is never updated.
> >>
> >> When you allocate a tree block and then remove the subvolume that did the
> >> allocation, the qgroup accounting for that removal is correct. However, the
> >> removal was accounted again for each subvolume deletion that also referenced
> >> the tree block, because accounting was erroneously based on the owner.
> >>
> >> Instead of queueing delayed refs for the non-existent owner, we now
> >> queue delayed refs for the root being removed. This fixes the qgroup
> >> accounting.
> >>
> >> Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
> >> Tested-by: <dustymabe@gmail.com>
> > 
> > This breaks btrfs/003, I'm kicking it out.
> 
> Can you be a bit more specific? Works fine here.
> 

It's blowing up on the balance, so maybe make a bigger fs and balance that so
you can see it?  It's exploding in __btrfs_free_extent because we can't find the
ref we're trying to drop, so I assume you've broken the dropping of the reloc
root part where it depends on you giving it btrfs_header_owner() instead of
root->root_key.objectid as when we cow blocks that belong to the reloc root we
leave the owner set to whoever owned the block originally and not the reloc
root.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wang Shilong Nov. 7, 2013, 1:33 a.m. UTC | #12
On 11/07/2013 01:34 AM, Josef Bacik wrote:
> On Wed, Nov 06, 2013 at 06:20:47PM +0100, Jan Schmidt wrote:
>>   
>> On Mon, November 04, 2013 at 18:42 (+0100), Josef Bacik wrote:
>>> On Thu, Oct 24, 2013 at 03:22:06PM +0200, Jan Schmidt wrote:
>>>> btrfs_dec_ref() queued a delayed ref for owner of a tree block. The qgroup
>>>> tracking is based on delayed refs. The owner of a tree block is set when a
>>>> tree block is allocated, it is never updated.
>>>>
>>>> When you allocate a tree block and then remove the subvolume that did the
>>>> allocation, the qgroup accounting for that removal is correct. However, the
>>>> removal was accounted again for each subvolume deletion that also referenced
>>>> the tree block, because accounting was erroneously based on the owner.
>>>>
>>>> Instead of queueing delayed refs for the non-existent owner, we now
>>>> queue delayed refs for the root being removed. This fixes the qgroup
>>>> accounting.
>>>>
>>>> Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
>>>> Tested-by: <dustymabe@gmail.com>
>>> This breaks btrfs/003, I'm kicking it out.
>> Can you be a bit more specific? Works fine here.
>>
> It's blowing up on the balance, so maybe make a bigger fs and balance that so
> you can see it?  It's exploding in __btrfs_free_extent because we can't find the
> ref we're trying to drop, so I assume you've broken the dropping of the reloc
> root part where it depends on you giving it btrfs_header_owner() instead of
> root->root_key.objectid as when we cow blocks that belong to the reloc root we
> leave the owner set to whoever owned the block originally and not the reloc
> root.  Thanks,
True, i did hit the problem when i test balance regression with btrfs-next.

[ 8468.976315] WARNING: CPU: 3 PID: 14993 at fs/btrfs/extent-tree.c:5783 
__btrfs_free_extent+0x9af/0xa20 [btrfs]()
[ 8468.976316] Modules linked in: btrfs(O) ebtable_nat ip6t_REJECT tun 
xt_CHECKSUM fuse bridge stp llc bluetooth rfkill iptable_mangle 
nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack 
ebtable_filter ebtables ip6table_filter ip6_tables libcrc32c xor 
zlib_deflate raid6_pq r8169 snd_hda_codec_realtek snd_hda_intel 
snd_hda_codec snd_hwdep snd_seq snd_seq_device snd_pcm wmi iTCO_wdt 
iTCO_vendor_support lpc_ich uinput snd_page_alloc snd_timer i2c_i801 snd 
soundcore acpi_cpufreq mperf mfd_core mii pcspkr i915 i2c_algo_bit 
drm_kms_helper drm i2c_core video [last unloaded: btrfs]
[ 8468.976339] CPU: 3 PID: 14993 Comm: btrfs Tainted: G           O 
3.11.0+ #17
[ 8468.976340] Hardware name: LENOVO QiTianM4350/ , BIOS F1KT52AUS 
05/24/2013
[ 8468.976341]  0000000000000009 ffff880087c11828 ffffffff8158f948 
0000000000000000
[ 8468.976343]  ffff880087c11860 ffffffff8104febd ffff8800ba554ab0 
000000001b64c000
[ 8468.976344]  0000000000000000 00000000fffffffe 0000000000000000 
ffff880087c11870
[ 8468.976346] Call Trace:
[ 8468.976351]  [<ffffffff8158f948>] dump_stack+0x45/0x56
[ 8468.976354]  [<ffffffff8104febd>] warn_slowpath_common+0x7d/0xa0
[ 8468.976356]  [<ffffffff8104ff9a>] warn_slowpath_null+0x1a/0x20
[ 8468.976361]  [<ffffffffa0501c3f>] __btrfs_free_extent+0x9af/0xa20 [btrfs]
[ 8468.976366]  [<ffffffffa04f4000>] ? btrfs_free_path+0x20/0x30 [btrfs]
[ 8468.976375]  [<ffffffffa055fcf4>] ? 
btrfs_merge_delayed_refs+0x1f4/0x3d0 [btrfs]
[ 8468.976380]  [<ffffffffa05061ac>] run_clustered_refs+0x46c/0x1080 [btrfs]
[ 8468.976386]  [<ffffffffa050aae0>] btrfs_run_delayed_refs+0xe0/0x540 
[btrfs]
[ 8468.976393]  [<ffffffffa0562ac3>] ? 
alloc_backref_node.isra.14+0x23/0x60 [btrfs]
[ 8468.976399]  [<ffffffffa0560910>] ? tree_insert+0x50/0x60 [btrfs]
[ 8468.976406]  [<ffffffffa0568229>] ? 
btrfs_reloc_post_snapshot+0xf9/0x360 [btrfs]
[ 8468.976412]  [<ffffffffa05198b6>] create_pending_snapshot+0x706/0x920 
[btrfs]
[ 8468.976418]  [<ffffffffa0519b3a>] create_pending_snapshots+0x6a/0x90 
[btrfs]
[ 8468.976424]  [<ffffffffa051afd4>] 
btrfs_commit_transaction+0x384/0x970 [btrfs]
[ 8468.976431]  [<ffffffffa054b22a>] btrfs_mksubvol.isra.30+0x3da/0x460 
[btrfs]
[ 8468.976437]  [<ffffffffa054b38e>] 
btrfs_ioctl_snap_create_transid+0xde/0x170 [btrfs]
[ 8468.976443]  [<ffffffffa054b57b>] 
btrfs_ioctl_snap_create_v2+0xeb/0x130 [btrfs]
[ 8468.976450]  [<ffffffffa054de9a>] btrfs_ioctl+0x11da/0x27a0 [btrfs]
[ 8468.976453]  [<ffffffff8113a420>] ? handle_mm_fault+0x210/0x310
[ 8468.976456]  [<ffffffff815997f4>] ? __do_page_fault+0x1f4/0x500
[ 8468.976457]  [<ffffffff81140995>] ? do_mmap_pgoff+0x305/0x3c0
[ 8468.976460]  [<ffffffff8117dd9d>] do_vfs_ioctl+0x2dd/0x4b0
[ 8468.976461]  [<ffffffff8117dff1>] SyS_ioctl+0x81/0xa0
[ 8468.976463]  [<ffffffff81599b0e>] ? do_page_fault+0xe/0x10
[ 8468.976465]  [<ffffffff8159e202>] system_call_fastpath+0x16/0x1b

Thanks,
Wang
>
> Josef
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d58bef1..7846cae 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3004,12 +3004,11 @@  out:
 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct extent_buffer *buf,
-			   int full_backref, int inc, int for_cow)
+			   int full_backref, u64 ref_root, int inc, int for_cow)
 {
 	u64 bytenr;
 	u64 num_bytes;
 	u64 parent;
-	u64 ref_root;
 	u32 nritems;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
@@ -3019,7 +3018,6 @@  static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
 			    u64, u64, u64, u64, u64, u64, int);
 
-	ref_root = btrfs_header_owner(buf);
 	nritems = btrfs_header_nritems(buf);
 	level = btrfs_header_level(buf);
 
@@ -3075,13 +3073,19 @@  fail:
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct extent_buffer *buf, int full_backref, int for_cow)
 {
-	return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
+	u64 ref_root;
+
+	ref_root = btrfs_header_owner(buf);
+
+	return __btrfs_mod_ref(trans, root, buf, full_backref, ref_root,
+			       1, for_cow);
 }
 
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct extent_buffer *buf, int full_backref, int for_cow)
 {
-	return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
+	return __btrfs_mod_ref(trans, root, buf, full_backref, root->objectid,
+			       0, for_cow);
 }
 
 static int write_one_cache_group(struct btrfs_trans_handle *trans,