diff mbox

[v3,2/3] btrfs: relocation: Fix leaking qgroups numbers on data extents

Message ID 20160809083021.12612-3-quwenruo@cn.fujitsu.com (mailing list archive)
State Superseded
Headers show

Commit Message

Qu Wenruo Aug. 9, 2016, 8:30 a.m. UTC
When balancing data extents, qgroup will leak all its numbers for
relocated data extents.

The relocation is done in the following steps for data extents:
1) Create data reloc tree and inode
2) Copy all data extents to data reloc tree
   And commit transaction
3) Create tree reloc tree(special snapshot) for any related subvolumes
4) Replace file extent in tree reloc tree with new extents in data reloc
   tree
   And commit transaction
5) Merge tree reloc tree with original fs, by swapping tree blocks

For 1)~4), since the tree reloc tree and data reloc tree don't count
toward qgroup, everything is OK.

But for 5), the swapping of tree blocks will only inform qgroup to
track metadata extents.

If metadata extents contain file extents, the qgroup numbers for those
file extents will get lost, leading to corrupted qgroup accounting.

The fix is to manually inform qgroup, before the commit transaction of
step 5), to track all file extents in the data reloc tree.
At commit transaction time the tree swapping is done, so qgroup will
account these data extents correctly.

Cc: Mark Fasheh <mfasheh@suse.de>
Reported-by: Mark Fasheh <mfasheh@suse.de>
Reported-by: Filipe Manana <fdmanana@gmail.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
---
 fs/btrfs/relocation.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 108 insertions(+), 6 deletions(-)

Comments

Goldwyn Rodrigues Aug. 9, 2016, 1:21 p.m. UTC | #1
On 08/09/2016 03:30 AM, Qu Wenruo wrote:
> When balancing data extents, qgroup will leak all its numbers for
> relocated data extents.
> 
> The relocation is done in the following steps for data extents:
> 1) Create data reloc tree and inode
> 2) Copy all data extents to data reloc tree
>    And commit transaction
> 3) Create tree reloc tree(special snapshot) for any related subvolumes
> 4) Replace file extent in tree reloc tree with new extents in data reloc
>    tree
>    And commit transaction
> 5) Merge tree reloc tree with original fs, by swapping tree blocks
> 
> For 1)~4), since tree reloc tree and data reloc tree doesn't count to
> qgroup, everything is OK.
> 
> But for 5), the swapping of tree blocks will only info qgroup to track
> metadata extents.
> 
> If metadata extents contain file extents, qgroup number for file extents
> will get lost, leading to corrupted qgroup accounting.
> 
> The fix is, before commit transaction of step 5), manually info qgroup to
> track all file extents in data reloc tree.
> Since at commit transaction time, the tree swapping is done, and qgroup
> will account these data extents correctly.
> 
> Cc: Mark Fasheh <mfasheh@suse.de>
> Reported-by: Mark Fasheh <mfasheh@suse.de>
> Reported-by: Filipe Manana <fdmanana@gmail.com>
> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>

Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>

> ---
>  fs/btrfs/relocation.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 108 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
> index b26a5ae..a6ace8a 100644
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -31,6 +31,7 @@
>  #include "async-thread.h"
>  #include "free-space-cache.h"
>  #include "inode-map.h"
> +#include "qgroup.h"
>  
>  /*
>   * backref_node, mapping_node and tree_block start with this
> @@ -3916,6 +3917,95 @@ int prepare_to_relocate(struct reloc_control *rc)
>  	return 0;
>  }
>  
> +/*
> + * Qgroup fixer for data chunk relocation.
> + * The data relocation is done in the following steps
> + * 1) Copy data extents into data reloc tree
> + * 2) Create tree reloc tree(special snapshot) for related subvolumes
> + * 3) Modify file extents in tree reloc tree
> + * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks
> + *
> + * The problem is, data and tree reloc tree are not accounted to qgroup,
> + * and 4) will only info qgroup to track tree blocks change, not file extents
> + * in the tree blocks.
> + *
> + * The good news is, related data extents are all in data reloc tree, so we
> + * only need to info qgroup to track all file extents in data reloc tree
> + * before commit trans.
> + */
> +static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans,
> +					     struct reloc_control *rc)
> +{
> +	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
> +	struct inode *inode = rc->data_inode;
> +	struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root;
> +	struct btrfs_path *path;
> +	struct btrfs_key key;
> +	int ret = 0;
> +
> +	if (!fs_info->quota_enabled)
> +		return 0;
> +
> +	/*
> +	 * Only for stage where we update data pointers the qgroup fix is
> +	 * valid.
> +	 * For MOVING_DATA stage, we will miss the timing of swapping tree
> +	 * blocks, and won't fix it.
> +	 */
> +	if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found))
> +		return 0;
> +
> +	path = btrfs_alloc_path();
> +	if (!path)
> +		return -ENOMEM;
> +	key.objectid = btrfs_ino(inode);
> +	key.type = BTRFS_EXTENT_DATA_KEY;
> +	key.offset = 0;
> +
> +	ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0);
> +	if (ret < 0)
> +		goto out;
> +
> +	lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1);
> +	while (1) {
> +		struct btrfs_file_extent_item *fi;
> +
> +		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
> +		if (key.objectid > btrfs_ino(inode))
> +			break;
> +		if (key.type != BTRFS_EXTENT_DATA_KEY)
> +			goto next;
> +		fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
> +				    struct btrfs_file_extent_item);
> +		if (btrfs_file_extent_type(path->nodes[0], fi) !=
> +				BTRFS_FILE_EXTENT_REG)
> +			goto next;
> +		/*
> +		pr_info("disk bytenr: %llu, num_bytes: %llu\n",
> +			btrfs_file_extent_disk_bytenr(path->nodes[0], fi),
> +			btrfs_file_extent_disk_num_bytes(path->nodes[0], fi));
> +			*/
> +		ret = btrfs_qgroup_insert_dirty_extent(trans, fs_info,
> +			btrfs_file_extent_disk_bytenr(path->nodes[0], fi),
> +			btrfs_file_extent_disk_num_bytes(path->nodes[0], fi),
> +			GFP_NOFS);
> +		if (ret < 0)
> +			break;
> +next:
> +		ret = btrfs_next_item(data_reloc_root, path);
> +		if (ret < 0)
> +			break;
> +		if (ret > 0) {
> +			ret = 0;
> +			break;
> +		}
> +	}
> +	unlock_extent(&BTRFS_I(inode)->io_tree, 0 , (u64)-1);
> +out:
> +	btrfs_free_path(path);
> +	return ret;
> +}
> +
>  static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
>  {
>  	struct rb_root blocks = RB_ROOT;
> @@ -4102,10 +4192,16 @@ restart:
>  
>  	/* get rid of pinned extents */
>  	trans = btrfs_join_transaction(rc->extent_root);
> -	if (IS_ERR(trans))
> +	if (IS_ERR(trans)) {
>  		err = PTR_ERR(trans);
> -	else
> -		btrfs_commit_transaction(trans, rc->extent_root);
> +		goto out_free;
> +	}
> +	err = qgroup_fix_relocated_data_extents(trans, rc);
> +	if (err < 0) {
> +		btrfs_abort_transaction(trans, err);
> +		goto out_free;
> +	}
> +	btrfs_commit_transaction(trans, rc->extent_root);
>  out_free:
>  	btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
>  	btrfs_free_path(path);
> @@ -4468,10 +4564,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
>  	unset_reloc_control(rc);
>  
>  	trans = btrfs_join_transaction(rc->extent_root);
> -	if (IS_ERR(trans))
> +	if (IS_ERR(trans)) {
>  		err = PTR_ERR(trans);
> -	else
> -		err = btrfs_commit_transaction(trans, rc->extent_root);
> +		goto out_free;
> +	}
> +	err = qgroup_fix_relocated_data_extents(trans, rc);
> +	if (err < 0) {
> +		btrfs_abort_transaction(trans, err);
> +		goto out_free;
> +	}
> +	err = btrfs_commit_transaction(trans, rc->extent_root);
>  out_free:
>  	kfree(rc);
>  out:
>
Filipe Manana Aug. 12, 2016, 1:33 p.m. UTC | #2
On Tue, Aug 9, 2016 at 9:30 AM, Qu Wenruo <quwenruo@cn.fujitsu.com> wrote:
> When balancing data extents, qgroup will leak all its numbers for
> relocated data extents.
>
> The relocation is done in the following steps for data extents:
> 1) Create data reloc tree and inode
> 2) Copy all data extents to data reloc tree
>    And commit transaction
> 3) Create tree reloc tree(special snapshot) for any related subvolumes
> 4) Replace file extent in tree reloc tree with new extents in data reloc
>    tree
>    And commit transaction
> 5) Merge tree reloc tree with original fs, by swapping tree blocks
>
> For 1)~4), since tree reloc tree and data reloc tree doesn't count to
> qgroup, everything is OK.
>
> But for 5), the swapping of tree blocks will only info qgroup to track
> metadata extents.
>
> If metadata extents contain file extents, qgroup number for file extents
> will get lost, leading to corrupted qgroup accounting.
>
> The fix is, before commit transaction of step 5), manually info qgroup to
> track all file extents in data reloc tree.
> Since at commit transaction time, the tree swapping is done, and qgroup
> will account these data extents correctly.

Hi Qu,

This changelog should mention that this fixes a regression introduced
in the 4.2 kernel.
It's especially important for the people responsible for backporting
fixes to earlier kernel releases.

>
> Cc: Mark Fasheh <mfasheh@suse.de>
> Reported-by: Mark Fasheh <mfasheh@suse.de>
> Reported-by: Filipe Manana <fdmanana@gmail.com>
> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> ---
>  fs/btrfs/relocation.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 108 insertions(+), 6 deletions(-)
>
> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
> index b26a5ae..a6ace8a 100644
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -31,6 +31,7 @@
>  #include "async-thread.h"
>  #include "free-space-cache.h"
>  #include "inode-map.h"
> +#include "qgroup.h"
>
>  /*
>   * backref_node, mapping_node and tree_block start with this
> @@ -3916,6 +3917,95 @@ int prepare_to_relocate(struct reloc_control *rc)
>         return 0;
>  }
>
> +/*
> + * Qgroup fixer for data chunk relocation.
> + * The data relocation is done in the following steps
> + * 1) Copy data extents into data reloc tree
> + * 2) Create tree reloc tree(special snapshot) for related subvolumes
> + * 3) Modify file extents in tree reloc tree
> + * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks
> + *
> + * The problem is, data and tree reloc tree are not accounted to qgroup,
> + * and 4) will only info qgroup to track tree blocks change, not file extents
> + * in the tree blocks.
> + *
> + * The good news is, related data extents are all in data reloc tree, so we
> + * only need to info qgroup to track all file extents in data reloc tree
> + * before commit trans.
> + */
> +static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans,
> +                                            struct reloc_control *rc)
> +{
> +       struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
> +       struct inode *inode = rc->data_inode;
> +       struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root;
> +       struct btrfs_path *path;
> +       struct btrfs_key key;
> +       int ret = 0;
> +
> +       if (!fs_info->quota_enabled)
> +               return 0;
> +
> +       /*
> +        * Only for stage where we update data pointers the qgroup fix is
> +        * valid.
> +        * For MOVING_DATA stage, we will miss the timing of swapping tree
> +        * blocks, and won't fix it.
> +        */
> +       if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found))
> +               return 0;
> +
> +       path = btrfs_alloc_path();
> +       if (!path)
> +               return -ENOMEM;
> +       key.objectid = btrfs_ino(inode);
> +       key.type = BTRFS_EXTENT_DATA_KEY;
> +       key.offset = 0;
> +
> +       ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0);
> +       if (ret < 0)
> +               goto out;
> +
> +       lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1);
> +       while (1) {
> +               struct btrfs_file_extent_item *fi;
> +
> +               btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
> +               if (key.objectid > btrfs_ino(inode))
> +                       break;
> +               if (key.type != BTRFS_EXTENT_DATA_KEY)
> +                       goto next;
> +               fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
> +                                   struct btrfs_file_extent_item);
> +               if (btrfs_file_extent_type(path->nodes[0], fi) !=
> +                               BTRFS_FILE_EXTENT_REG)
> +                       goto next;
> +               /*
> +               pr_info("disk bytenr: %llu, num_bytes: %llu\n",
> +                       btrfs_file_extent_disk_bytenr(path->nodes[0], fi),
> +                       btrfs_file_extent_disk_num_bytes(path->nodes[0], fi));
> +                       */

Please remove this debugging pr_info.

> +               ret = btrfs_qgroup_insert_dirty_extent(trans, fs_info,
> +                       btrfs_file_extent_disk_bytenr(path->nodes[0], fi),
> +                       btrfs_file_extent_disk_num_bytes(path->nodes[0], fi),
> +                       GFP_NOFS);
> +               if (ret < 0)
> +                       break;
> +next:
> +               ret = btrfs_next_item(data_reloc_root, path);
> +               if (ret < 0)
> +                       break;
> +               if (ret > 0) {
> +                       ret = 0;
> +                       break;
> +               }
> +       }
> +       unlock_extent(&BTRFS_I(inode)->io_tree, 0 , (u64)-1);
> +out:
> +       btrfs_free_path(path);
> +       return ret;
> +}
> +
>  static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
>  {
>         struct rb_root blocks = RB_ROOT;
> @@ -4102,10 +4192,16 @@ restart:
>
>         /* get rid of pinned extents */
>         trans = btrfs_join_transaction(rc->extent_root);
> -       if (IS_ERR(trans))
> +       if (IS_ERR(trans)) {
>                 err = PTR_ERR(trans);
> -       else
> -               btrfs_commit_transaction(trans, rc->extent_root);
> +               goto out_free;
> +       }
> +       err = qgroup_fix_relocated_data_extents(trans, rc);
> +       if (err < 0) {
> +               btrfs_abort_transaction(trans, err);
> +               goto out_free;
> +       }
> +       btrfs_commit_transaction(trans, rc->extent_root);
>  out_free:
>         btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
>         btrfs_free_path(path);
> @@ -4468,10 +4564,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
>         unset_reloc_control(rc);
>
>         trans = btrfs_join_transaction(rc->extent_root);
> -       if (IS_ERR(trans))
> +       if (IS_ERR(trans)) {
>                 err = PTR_ERR(trans);
> -       else
> -               err = btrfs_commit_transaction(trans, rc->extent_root);
> +               goto out_free;
> +       }
> +       err = qgroup_fix_relocated_data_extents(trans, rc);
> +       if (err < 0) {
> +               btrfs_abort_transaction(trans, err);
> +               goto out_free;
> +       }
> +       err = btrfs_commit_transaction(trans, rc->extent_root);
>  out_free:
>         kfree(rc);
>  out:
> --
> 2.9.2
>
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Qu Wenruo Aug. 15, 2016, 2:04 a.m. UTC | #3
At 08/12/2016 09:33 PM, Filipe Manana wrote:
> On Tue, Aug 9, 2016 at 9:30 AM, Qu Wenruo <quwenruo@cn.fujitsu.com> wrote:
>> When balancing data extents, qgroup will leak all its numbers for
>> relocated data extents.
>>
>> The relocation is done in the following steps for data extents:
>> 1) Create data reloc tree and inode
>> 2) Copy all data extents to data reloc tree
>>    And commit transaction
>> 3) Create tree reloc tree(special snapshot) for any related subvolumes
>> 4) Replace file extent in tree reloc tree with new extents in data reloc
>>    tree
>>    And commit transaction
>> 5) Merge tree reloc tree with original fs, by swapping tree blocks
>>
>> For 1)~4), since tree reloc tree and data reloc tree doesn't count to
>> qgroup, everything is OK.
>>
>> But for 5), the swapping of tree blocks will only info qgroup to track
>> metadata extents.
>>
>> If metadata extents contain file extents, qgroup number for file extents
>> will get lost, leading to corrupted qgroup accounting.
>>
>> The fix is, before commit transaction of step 5), manually info qgroup to
>> track all file extents in data reloc tree.
>> Since at commit transaction time, the tree swapping is done, and qgroup
>> will account these data extents correctly.
>
> Hi Qu,
>
> This changelog should mention this fixes a regression introduced in
> the 4.2 kernel.
> It's specially important for people responsible to backport fixes to
> earlier kernel releases.

Thanks, I'll update this patch and remove the pr_info in next version.

Thanks,
Qu
>
>>
>> Cc: Mark Fasheh <mfasheh@suse.de>
>> Reported-by: Mark Fasheh <mfasheh@suse.de>
>> Reported-by: Filipe Manana <fdmanana@gmail.com>
>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>> ---
>>  fs/btrfs/relocation.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 108 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
>> index b26a5ae..a6ace8a 100644
>> --- a/fs/btrfs/relocation.c
>> +++ b/fs/btrfs/relocation.c
>> @@ -31,6 +31,7 @@
>>  #include "async-thread.h"
>>  #include "free-space-cache.h"
>>  #include "inode-map.h"
>> +#include "qgroup.h"
>>
>>  /*
>>   * backref_node, mapping_node and tree_block start with this
>> @@ -3916,6 +3917,95 @@ int prepare_to_relocate(struct reloc_control *rc)
>>         return 0;
>>  }
>>
>> +/*
>> + * Qgroup fixer for data chunk relocation.
>> + * The data relocation is done in the following steps
>> + * 1) Copy data extents into data reloc tree
>> + * 2) Create tree reloc tree(special snapshot) for related subvolumes
>> + * 3) Modify file extents in tree reloc tree
>> + * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks
>> + *
>> + * The problem is, data and tree reloc tree are not accounted to qgroup,
>> + * and 4) will only info qgroup to track tree blocks change, not file extents
>> + * in the tree blocks.
>> + *
>> + * The good news is, related data extents are all in data reloc tree, so we
>> + * only need to info qgroup to track all file extents in data reloc tree
>> + * before commit trans.
>> + */
>> +static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans,
>> +                                            struct reloc_control *rc)
>> +{
>> +       struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
>> +       struct inode *inode = rc->data_inode;
>> +       struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root;
>> +       struct btrfs_path *path;
>> +       struct btrfs_key key;
>> +       int ret = 0;
>> +
>> +       if (!fs_info->quota_enabled)
>> +               return 0;
>> +
>> +       /*
>> +        * Only for stage where we update data pointers the qgroup fix is
>> +        * valid.
>> +        * For MOVING_DATA stage, we will miss the timing of swapping tree
>> +        * blocks, and won't fix it.
>> +        */
>> +       if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found))
>> +               return 0;
>> +
>> +       path = btrfs_alloc_path();
>> +       if (!path)
>> +               return -ENOMEM;
>> +       key.objectid = btrfs_ino(inode);
>> +       key.type = BTRFS_EXTENT_DATA_KEY;
>> +       key.offset = 0;
>> +
>> +       ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0);
>> +       if (ret < 0)
>> +               goto out;
>> +
>> +       lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1);
>> +       while (1) {
>> +               struct btrfs_file_extent_item *fi;
>> +
>> +               btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
>> +               if (key.objectid > btrfs_ino(inode))
>> +                       break;
>> +               if (key.type != BTRFS_EXTENT_DATA_KEY)
>> +                       goto next;
>> +               fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
>> +                                   struct btrfs_file_extent_item);
>> +               if (btrfs_file_extent_type(path->nodes[0], fi) !=
>> +                               BTRFS_FILE_EXTENT_REG)
>> +                       goto next;
>> +               /*
>> +               pr_info("disk bytenr: %llu, num_bytes: %llu\n",
>> +                       btrfs_file_extent_disk_bytenr(path->nodes[0], fi),
>> +                       btrfs_file_extent_disk_num_bytes(path->nodes[0], fi));
>> +                       */
>
> Please remove this debugging pr_info.
>
>> +               ret = btrfs_qgroup_insert_dirty_extent(trans, fs_info,
>> +                       btrfs_file_extent_disk_bytenr(path->nodes[0], fi),
>> +                       btrfs_file_extent_disk_num_bytes(path->nodes[0], fi),
>> +                       GFP_NOFS);
>> +               if (ret < 0)
>> +                       break;
>> +next:
>> +               ret = btrfs_next_item(data_reloc_root, path);
>> +               if (ret < 0)
>> +                       break;
>> +               if (ret > 0) {
>> +                       ret = 0;
>> +                       break;
>> +               }
>> +       }
>> +       unlock_extent(&BTRFS_I(inode)->io_tree, 0 , (u64)-1);
>> +out:
>> +       btrfs_free_path(path);
>> +       return ret;
>> +}
>> +
>>  static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
>>  {
>>         struct rb_root blocks = RB_ROOT;
>> @@ -4102,10 +4192,16 @@ restart:
>>
>>         /* get rid of pinned extents */
>>         trans = btrfs_join_transaction(rc->extent_root);
>> -       if (IS_ERR(trans))
>> +       if (IS_ERR(trans)) {
>>                 err = PTR_ERR(trans);
>> -       else
>> -               btrfs_commit_transaction(trans, rc->extent_root);
>> +               goto out_free;
>> +       }
>> +       err = qgroup_fix_relocated_data_extents(trans, rc);
>> +       if (err < 0) {
>> +               btrfs_abort_transaction(trans, err);
>> +               goto out_free;
>> +       }
>> +       btrfs_commit_transaction(trans, rc->extent_root);
>>  out_free:
>>         btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
>>         btrfs_free_path(path);
>> @@ -4468,10 +4564,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
>>         unset_reloc_control(rc);
>>
>>         trans = btrfs_join_transaction(rc->extent_root);
>> -       if (IS_ERR(trans))
>> +       if (IS_ERR(trans)) {
>>                 err = PTR_ERR(trans);
>> -       else
>> -               err = btrfs_commit_transaction(trans, rc->extent_root);
>> +               goto out_free;
>> +       }
>> +       err = qgroup_fix_relocated_data_extents(trans, rc);
>> +       if (err < 0) {
>> +               btrfs_abort_transaction(trans, err);
>> +               goto out_free;
>> +       }
>> +       err = btrfs_commit_transaction(trans, rc->extent_root);
>>  out_free:
>>         kfree(rc);
>>  out:
>> --
>> 2.9.2
>>
>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>
>


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b26a5ae..a6ace8a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -31,6 +31,7 @@ 
 #include "async-thread.h"
 #include "free-space-cache.h"
 #include "inode-map.h"
+#include "qgroup.h"
 
 /*
  * backref_node, mapping_node and tree_block start with this
@@ -3916,6 +3917,95 @@  int prepare_to_relocate(struct reloc_control *rc)
 	return 0;
 }
 
+/*
+ * Qgroup fixer for data chunk relocation.
+ * The data relocation is done in the following steps:
+ * 1) Copy data extents into the data reloc tree
+ * 2) Create tree reloc tree (special snapshot) for related subvolumes
+ * 3) Modify file extents in the tree reloc tree
+ * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks
+ *
+ * Data and tree reloc trees are not accounted to qgroup, and 4) only informs
+ * qgroup about tree block changes, not the file extents in those blocks.
+ * Since the related data extents are all in the data reloc tree, informing
+ * qgroup of every file extent there before the commit is enough.
+ *
+ * Returns 0 on success or when there is nothing to fix up, and a negative
+ * error code on failure.
+ */
+static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans,
+					     struct reloc_control *rc)
+{
+	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+	struct inode *inode = rc->data_inode;
+	struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret = 0;
+
+	if (!fs_info->quota_enabled)
+		return 0;
+
+	/*
+	 * The qgroup fix is only valid in the stage where we update data
+	 * pointers, and only if extents were found.
+	 * In the MOVING_DATA stage we would miss the timing of swapping tree
+	 * blocks, so nothing is fixed there.
+	 */
+	if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found))
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	key.objectid = btrfs_ino(inode);
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1);
+	while (1) {
+		struct btrfs_file_extent_item *fi;
+
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		if (key.objectid > btrfs_ino(inode))
+			break;
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			goto next;
+		fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				    struct btrfs_file_extent_item);
+		if (btrfs_file_extent_type(path->nodes[0], fi) !=
+				BTRFS_FILE_EXTENT_REG)
+			goto next;
+		/*
+		 * Hand the on-disk range of this regular file extent to
+		 * qgroup as a dirty extent, so that it gets accounted when
+		 * the transaction commits.
+		 */
+		ret = btrfs_qgroup_insert_dirty_extent(trans, fs_info,
+			btrfs_file_extent_disk_bytenr(path->nodes[0], fi),
+			btrfs_file_extent_disk_num_bytes(path->nodes[0], fi),
+			GFP_NOFS);
+		if (ret < 0)
+			break;
+next:
+		ret = btrfs_next_item(data_reloc_root, path);
+		if (ret < 0)
+			break;
+		if (ret > 0) {
+			ret = 0;
+			break;
+		}
+	}
+	unlock_extent(&BTRFS_I(inode)->io_tree, 0 , (u64)-1);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 {
 	struct rb_root blocks = RB_ROOT;
@@ -4102,10 +4192,16 @@  restart:
 
 	/* get rid of pinned extents */
 	trans = btrfs_join_transaction(rc->extent_root);
-	if (IS_ERR(trans))
+	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
-	else
-		btrfs_commit_transaction(trans, rc->extent_root);
+		goto out_free;
+	}
+	err = qgroup_fix_relocated_data_extents(trans, rc);
+	if (err < 0) {
+		btrfs_abort_transaction(trans, err);
+		goto out_free;
+	}
+	btrfs_commit_transaction(trans, rc->extent_root);
 out_free:
 	btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
 	btrfs_free_path(path);
@@ -4468,10 +4564,16 @@  int btrfs_recover_relocation(struct btrfs_root *root)
 	unset_reloc_control(rc);
 
 	trans = btrfs_join_transaction(rc->extent_root);
-	if (IS_ERR(trans))
+	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
-	else
-		err = btrfs_commit_transaction(trans, rc->extent_root);
+		goto out_free;
+	}
+	err = qgroup_fix_relocated_data_extents(trans, rc);
+	if (err < 0) {
+		btrfs_abort_transaction(trans, err);
+		goto out_free;
+	}
+	err = btrfs_commit_transaction(trans, rc->extent_root);
 out_free:
 	kfree(rc);
 out: