btrfs: qgroup: account shared subtrees during snapshot delete

Message ID	1404771000-30940-3-git-send-email-mfasheh@suse.de (mailing list archive)
State	Superseded
Headers	show Return-Path: <linux-btrfs-owner@kernel.org> From: Mark Fasheh <mfasheh@suse.de> To: linux-btrfs@vger.kernel.org Cc: Chris Mason <clm@fb.com>, Josef Bacik <jbacik@fb.com>, Mark Fasheh <mfasheh@suse.de> Subject: btrfs: qgroup: account shared subtrees during snapshot delete Date: Mon, 7 Jul 2014 15:09:59 -0700 Message-Id: <1404771000-30940-3-git-send-email-mfasheh@suse.de> In-Reply-To: <1404771000-30940-1-git-send-email-mfasheh@suse.de> References: <1404771000-30940-1-git-send-email-mfasheh@suse.de> Sender: linux-btrfs-owner@vger.kernel.org Precedence: bulk

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 46f39bf..3f43e9a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7324,6 +7324,220 @@ reada: wc->reada_slot = slot; } +static int account_leaf_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *eb) +{ + int nr = btrfs_header_nritems(eb); + int i, extent_type, ret; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + u64 bytenr, num_bytes; + + for (i = 0; i < nr; i++) { + btrfs_item_key_to_cpu(eb, &key, i); + + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + + fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); + /* filter out non qgroup-accountable extents */ + extent_type = btrfs_file_extent_type(eb, fi); + + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + continue; + + bytenr = btrfs_file_extent_disk_bytenr(eb, fi); + if (!bytenr) + continue; + + num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); + + ret = btrfs_qgroup_record_ref(trans, root->fs_info, + root->objectid, + bytenr, num_bytes, + BTRFS_QGROUP_OPER_SUB_SUBTREE, 0); + if (ret) + return ret; + } + return 0; +} + +/* + * Walk up the tree from the bottom, freeing leaves and any interior + * nodes which have had all slots visited. If a node (leaf or + * interior) is freed, the node above it will have it's slot + * incremented. The root node will never be freed. + * + * At the end of this function, we should have a path which has all + * slots incremented to the next position for a search. If we need to + * read a new node it will be NULL and the node above it will have the + * correct slot selected for a later read. + * + * If we increment the root nodes slot counter past the number of + * elements, 1 is returned to signal completion of the search. + */ +static int adjust_slots_upwards(struct btrfs_root *root, + struct btrfs_path *path, int root_level) +{ + int level = 0; + int nr, slot; + struct extent_buffer *eb; + + if (root_level == 0) + return 1; + + while (level <= root_level) { + eb = path->nodes[level]; + nr = btrfs_header_nritems(eb); + path->slots[level]++; + slot = path->slots[level]; + if (slot >= nr || level == 0) { + /* + * Don't free the root - we will detect this + * condition after our loop and return a + * positive value for caller to stop walking the tree. + */ + if (level != root_level) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; + + free_extent_buffer(eb); + path->nodes[level] = NULL; + path->slots[level] = 0; + } + } else { + /* + * We have a valid slot to walk back down + * from. Stop here so caller can process these + * new nodes. + */ + break; + } + + level++; + } + + eb = path->nodes[root_level]; + if (path->slots[root_level] >= btrfs_header_nritems(eb)) + return 1; + + return 0; +} + +/* + * root_eb is the subtree root and is locked before this function is called. + */ +static int account_shared_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *root_eb, + u64 root_gen, + int root_level) +{ + int ret = 0; + int level; + struct extent_buffer *eb = root_eb; + struct btrfs_path *path = NULL; + + BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL); + BUG_ON(root_eb == NULL); + + if (!root->fs_info->quota_enabled) + return 0; + + if (!extent_buffer_uptodate(root_eb)) { + ret = btrfs_read_buffer(root_eb, root_gen); + if (ret) + goto out; + } + + if (root_level == 0) { + ret = account_leaf_items(trans, root, root_eb); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * Walk down the tree. Missing extent blocks are filled in as + * we go. Metadata is accounted every time we read a new + * extent block. + * + * When we reach a leaf, we account for file extent items in it, + * walk back up the tree (adjusting slot pointers as we go) + * and restart the search process. + */ + extent_buffer_get(root_eb); /* For path */ + path->nodes[root_level] = root_eb; + path->slots[root_level] = 0; + path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ +walk_down: + level = root_level; + while (level >= 0) { + if (path->nodes[level] == NULL) { + int child_bsize = btrfs_level_size(root, level); + int parent_slot; + u64 child_gen; + u64 child_bytenr; + + /* We need to get child blockptr/gen from + * parent before we can read it. */ + eb = path->nodes[level + 1]; + parent_slot = path->slots[level + 1]; + child_bytenr = btrfs_node_blockptr(eb, parent_slot); + child_gen = btrfs_node_ptr_generation(eb, parent_slot); + + eb = read_tree_block(root, child_bytenr, child_bsize, + child_gen); + if (!eb || !extent_buffer_uptodate(eb)) { + ret = -EIO; + goto out; + } + + path->nodes[level] = eb; + path->slots[level] = 0; + + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + path->locks[level] = BTRFS_READ_LOCK_BLOCKING; + + ret = btrfs_qgroup_record_ref(trans, root->fs_info, + root->objectid, + child_bytenr, + child_bsize, + BTRFS_QGROUP_OPER_SUB_SUBTREE, + 0); + if (ret) + goto out; + + } + + if (level == 0) { + ret = account_leaf_items(trans, root, path->nodes[level]); + if (ret) + goto out; + + /* Nonzero return here means we completed our search */ + ret = adjust_slots_upwards(root, path, root_level); + if (ret) + break; + + /* Restart search with new slots */ + goto walk_down; + } + + level--; + } + + ret = 0; +out: + btrfs_free_path(path); + + return ret; +} + /* * helper to process tree block while walking down the tree. * @@ -7427,6 +7641,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, int level = wc->level; int reada = 0; int ret = 0; + bool need_account = false; generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); @@ -7472,6 +7687,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (wc->stage == DROP_REFERENCE) { if (wc->refs[level - 1] > 1) { + need_account = true; if (level == 1 && (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) goto skip; @@ -7535,6 +7751,16 @@ skip: parent = 0; } + if (need_account) { + ret = account_shared_subtree(trans, root, next, + generation, level - 1); + if (ret) { + printk_ratelimited(KERN_ERR "BTRFS: %s Error " + "%d accounting shared subtree. Quota " + "is out of sync, rescan required.\n", + root->fs_info->sb->s_id, ret); + } + } ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, root->root_key.objectid, level - 1, 0, 0); BUG_ON(ret); /* -ENOMEM */ @@ -7621,6 +7847,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); BUG_ON(ret); /* -ENOMEM */ + ret = account_leaf_items(trans, root, eb); + if (ret) { + printk_ratelimited(KERN_ERR "BTRFS: %s Error " + "%d accounting leaf items. Quota " + "is out of sync, rescan required.\n", + root->fs_info->sb->s_id, ret); + } } /* make block locked assertion in clean_tree_block happy */ if (!path->locks[level] && @@ -7746,6 +7979,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int level; bool root_dropped = false; + btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid); + path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; @@ -7871,6 +8106,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out_end_trans; } + /* + * Qgroup update accounting is run from + * delayed ref handling. This usually works + * out because delayed refs are normally the + * only way qgroup updates are added. However, + * we may have added updates during our tree + * walk so run qgroups here to make sure we + * don't lose any updates. + */ + ret = btrfs_delayed_qgroup_accounting(trans, + root->fs_info); + if (ret) + printk_ratelimited(KERN_ERR "BTRFS: Failure %d " + "running qgroup updates " + "during snapshot delete. " + "Quota is out of sync, " + "rescan required.\n", ret); + btrfs_end_transaction_throttle(trans, tree_root); if (!for_reloc && btrfs_need_cleaner_sleep(root)) { pr_debug("BTRFS: drop snapshot early exit\n"); @@ -7924,6 +8177,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } root_dropped = true; out_end_trans: + ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); + if (ret) + printk_ratelimited(KERN_ERR "BTRFS: Failure %d " + "running qgroup updates " + "during snapshot delete. " + "Quota is out of sync, " + "rescan required.\n", ret); + btrfs_end_transaction_throttle(trans, tree_root); out_free: kfree(wc); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a9f0f05..9b0cb93 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1201,6 +1201,50 @@ out: mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } + +static int comp_oper_exist(struct btrfs_qgroup_operation *oper1, + struct btrfs_qgroup_operation *oper2) +{ + /* + * Ignore seq and type here, we're looking for any operation + * at all related to this extent on that root. + */ + if (oper1->bytenr < oper2->bytenr) + return -1; + if (oper1->bytenr > oper2->bytenr) + return 1; + if (oper1->ref_root < oper2->ref_root) + return -1; + if (oper1->ref_root > oper2->ref_root) + return 1; + return 0; +} + +static int qgroup_oper_exists(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct rb_node *n; + struct btrfs_qgroup_operation *cur; + int cmp; + + spin_lock(&fs_info->qgroup_op_lock); + n = fs_info->qgroup_op_tree.rb_node; + while (n) { + cur = rb_entry(n, struct btrfs_qgroup_operation, n); + cmp = comp_oper_exist(cur, oper); + if (cmp < 0) { + n = n->rb_right; + } else if (cmp) { + n = n->rb_left; + } else { + spin_unlock(&fs_info->qgroup_op_lock); + return -EEXIST; + } + } + spin_unlock(&fs_info->qgroup_op_lock); + return 0; +} + static int comp_oper(struct btrfs_qgroup_operation *oper1, struct btrfs_qgroup_operation *oper2) { @@ -1290,7 +1334,25 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); INIT_LIST_HEAD(&oper->elem.list); oper->elem.seq = 0; + trace_btrfs_qgroup_record_ref(oper); + + if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) { + /* + * If any operation for this bytenr/ref_root combo + * exists, then we know it's not exclusively owned and + * shouldn't be queued up. + * + * This also catches the case where we have a cloned + * extent that gets queued up multiple times during + * drop snapshot. + */ + if (qgroup_oper_exists(fs_info, oper)) { + kfree(oper); + return 0; + } + } + ret = insert_qgroup_oper(fs_info, oper); if (ret) { /* Shouldn't happen so have an assert for developers */ @@ -1883,6 +1945,106 @@ out: } /* + * Process a reference to a shared subtree. This type of operation is + * queued during snapshot removal when we encounter extents which are + * shared between more than one root. + */ +static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct ulist *roots = NULL; + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup_list *glist; + struct ulist *parents; + int ret = 0; + struct btrfs_qgroup *qg; + u64 root_obj = 0; + struct seq_list elem = {}; + + parents = ulist_alloc(GFP_NOFS); + if (!parents) + return -ENOMEM; + + btrfs_get_tree_mod_seq(fs_info, &elem); + ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, + elem.seq, &roots); + btrfs_put_tree_mod_seq(fs_info, &elem); + if (ret < 0) + return ret; + + if (roots->nnodes != 1) + goto out; + + ULIST_ITER_INIT(&uiter); + unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */ + /* + * If we find our ref root then that means all refs + * this extent has to the root have not yet been + * deleted. In that case, we do nothing and let the + * last ref for this bytenr drive our update. + * + * This can happen for example if an extent is + * referenced multiple times in a snapshot (clone, + * etc). If we are in the middle of snapshot removal, + * queued updates for such an extent will find the + * root if we have not yet finished removing the + * snapshot. + */ + if (unode->val == oper->ref_root) + goto out; + + root_obj = unode->val; + BUG_ON(!root_obj); + + spin_lock(&fs_info->qgroup_lock); + qg = find_qgroup_rb(fs_info, root_obj); + if (!qg) + goto out_unlock; + + qg->excl += oper->num_bytes; + qg->excl_cmpr += oper->num_bytes; + qgroup_dirty(fs_info, qg); + + /* + * Adjust counts for parent groups. First we find all + * parents, then in the 2nd loop we do the adjustment + * while adding parents of the parents to our ulist. + */ + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(parents, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out_unlock; + } + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(parents, &uiter))) { + qg = u64_to_ptr(unode->aux); + qg->excl += oper->num_bytes; + qg->excl_cmpr += oper->num_bytes; + qgroup_dirty(fs_info, qg); + + /* Add any parents of the parents */ + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(parents, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out_unlock; + } + } + +out_unlock: + spin_unlock(&fs_info->qgroup_lock); + +out: + ulist_free(roots); + ulist_free(parents); + return ret; +} + +/* * btrfs_qgroup_account_ref is called for every ref that is added to or deleted * from the fs. First, all roots referencing the extent are searched, and * then the space is accounted accordingly to the different roots. The @@ -1921,6 +2083,9 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, case BTRFS_QGROUP_OPER_SUB_SHARED: ret = qgroup_shared_accounting(trans, fs_info, oper); break; + case BTRFS_QGROUP_OPER_SUB_SUBTREE: + ret = qgroup_subtree_accounting(trans, fs_info, oper); + break; default: ASSERT(0); } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 5952ff1..18cc68c 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -44,6 +44,7 @@ enum btrfs_qgroup_operation_type { BTRFS_QGROUP_OPER_ADD_SHARED, BTRFS_QGROUP_OPER_SUB_EXCL, BTRFS_QGROUP_OPER_SUB_SHARED, + BTRFS_QGROUP_OPER_SUB_SUBTREE, }; struct btrfs_qgroup_operation { diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index b8774b3..96daac6 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1125,7 +1125,8 @@ DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy, { BTRFS_QGROUP_OPER_ADD_EXCL, "OPER_ADD_EXCL" }, \ { BTRFS_QGROUP_OPER_ADD_SHARED, "OPER_ADD_SHARED" }, \ { BTRFS_QGROUP_OPER_SUB_EXCL, "OPER_SUB_EXCL" }, \ - { BTRFS_QGROUP_OPER_SUB_SHARED, "OPER_SUB_SHARED" }) + { BTRFS_QGROUP_OPER_SUB_SHARED, "OPER_SUB_SHARED" }, \ + { BTRFS_QGROUP_OPER_SUB_SUBTREE,"OPER_SUB_SUBTREE" }) DECLARE_EVENT_CLASS(btrfs_qgroup_oper,

btrfs: qgroup: account shared subtrees during snapshot delete

Commit Message

Patch