
[08/13] btrfs: convert preliminary reference tracking to use rbtrees

Message ID 20170620160653.19907-9-enadolski@suse.com (mailing list archive)
State New, archived

Commit Message

Edmund Nadolski June 20, 2017, 4:06 p.m. UTC
It's been known for a while that the use of multiple lists
that are periodically merged was an algorithmic problem within
btrfs.  There are several workloads that don't complete in any
reasonable amount of time (e.g. btrfs/130) and others that cause
soft lockups.

The solution is to use a pair of rbtrees that do insertion merging
for both indirect and direct refs, with the former converting
refs into the latter.  The result is a btrfs/130 workload that
used to take several hours now takes about half of that. This
runtime still isn't acceptable and a future patch will address that
by moving the rbtrees higher in the stack so the lookups can be
shared across multiple calls to find_parent_nodes.

Signed-off-by: Edmund Nadolski <enadolski@suse.com>
Signed-off-by: Jeff Mahoney <jeffm@suse.com>
---
 fs/btrfs/backref.c | 415 ++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 267 insertions(+), 148 deletions(-)
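
In outline, the new flow (condensed from the patch below; struct and
helper names are the patch's own, and the call sequence is abbreviated
for orientation only):

	struct preftrees preftrees = {
		.direct = PREFTREE_INIT,
		.indirect = PREFTREE_INIT
	};

	/* Phase 1: collect delayed, inline and keyed backrefs.  Every
	 * insert goes through prelim_ref_insert(), which merges identical
	 * entries instead of appending duplicates. */
	add_indirect_ref(&preftrees, root_id, &key, level, bytenr, count, GFP_NOFS);
	add_direct_ref(&preftrees, level, parent, bytenr, count, GFP_NOFS);

	/* Phase 2: resolve each indirect ref to its parent block(s) and
	 * move it into the direct tree, merging again on insert. */
	resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
			      extent_item_pos, total_refs, root_objectid);

	/* Phase 3: a single walk of the direct tree builds the ulist of
	 * parents/roots; both trees are then freed with prelim_release(). */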

Comments

Jeff Mahoney June 26, 2017, 5:07 p.m. UTC | #1
On 6/20/17 12:06 PM, Edmund Nadolski wrote:
> It's been known for a while that the use of multiple lists
> that are periodically merged was an algorithmic problem within
> btrfs.  There are several workloads that don't complete in any
> reasonable amount of time (e.g. btrfs/130) and others that cause
> soft lockups.
> 
> The solution is to use a pair of rbtrees that do insertion merging
> for both indirect and direct refs, with the former converting
> refs into the latter.  The result is a btrfs/130 workload that
> used to take several hours now takes about half of that. This
> runtime still isn't acceptable and a future patch will address that
> by moving the rbtrees higher in the stack so the lookups can be
> shared across multiple calls to find_parent_nodes.
> 
> Signed-off-by: Edmund Nadolski <enadolski@suse.com>
> Signed-off-by: Jeff Mahoney <jeffm@suse.com>
[...]

> @@ -504,37 +665,22 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
>  	return ret;
>  }
>  
> -static inline int ref_for_same_block(struct prelim_ref *ref1,
> -				     struct prelim_ref *ref2)
> -{
> -	if (ref1->level != ref2->level)
> -		return 0;
> -	if (ref1->root_id != ref2->root_id)
> -		return 0;
> -	if (ref1->key_for_search.type != ref2->key_for_search.type)
> -		return 0;
> -	if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
> -		return 0;
> -	if (ref1->key_for_search.offset != ref2->key_for_search.offset)
> -		return 0;
> -	if (ref1->parent != ref2->parent)
> -		return 0;
> -
> -	return 1;
> -}
> -
>  /*
>   * read tree blocks and add keys where required.
>   */
>  static int add_missing_keys(struct btrfs_fs_info *fs_info,
> -			    struct list_head *head)
> +			    struct preftrees *preftrees)
>  {
>  	struct prelim_ref *ref;
>  	struct extent_buffer *eb;
> +	struct rb_node *node = rb_first(&preftrees->indirect.root);
> +
> +	while (node) {
> +		ref = rb_entry(node, struct prelim_ref, rbnode);
> +		node = rb_next(&ref->rbnode);
> +		if (WARN(ref->parent, "BUG: direct ref found in indirect tree"))
> +			return -EINVAL;
>  
> -	list_for_each_entry(ref, head, list) {
> -		if (ref->parent)
> -			continue;
>  		if (ref->key_for_search.type)
>  			continue;
>  		BUG_ON(!ref->wanted_disk_byte);

Hi Ed -

I missed this in earlier review, but this can't work.  We're modifying
the ref in a way that the comparator will care about -- so the node
would move in the tree.

It's not a fatal flaw and, in fact, leaves us an opening to fix a
separate locking issue.

-Jeff
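
[For readers following along: the hazard is that key_for_search
participates in prelim_ref_compare(), so filling it in while the ref is
still linked leaves the node at a position that no longer matches its
key.  A hypothetical fragment using the kernel rbtree API; the
erase/re-insert variant is one way to keep the ordering invariant, not
necessarily the fix Jeff has in mind:

	/* WRONG while ref->rbnode is still linked: later searches and
	 * inserts may walk past the mis-positioned node. */
	btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);

	/* Keeping the invariant: unlink first, update, then re-insert,
	 * which also re-merges the ref with any identical entry. */
	rb_erase(&ref->rbnode, &preftrees->indirect.root);
	btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
	prelim_ref_insert(&preftrees->indirect, ref);
]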
David Sterba June 27, 2017, 4:49 p.m. UTC | #2
On Tue, Jun 20, 2017 at 10:06:48AM -0600, Edmund Nadolski wrote:
> It's been known for a while that the use of multiple lists
> that are periodically merged was an algorithmic problem within
> btrfs.  There are several workloads that don't complete in any
> reasonable amount of time (e.g. btrfs/130) and others that cause
> soft lockups.
> 
> The solution is to use a pair of rbtrees that do insertion merging
> for both indirect and direct refs, with the former converting
> refs into the latter.  The result is a btrfs/130 workload that
> used to take several hours now takes about half of that. This
> runtime still isn't acceptable and a future patch will address that
> by moving the rbtrees higher in the stack so the lookups can be
> shared across multiple calls to find_parent_nodes.
> 
> Signed-off-by: Edmund Nadolski <enadolski@suse.com>
> Signed-off-by: Jeff Mahoney <jeffm@suse.com>
> ---
>  fs/btrfs/backref.c | 415 ++++++++++++++++++++++++++++++++++-------------------
>  1 file changed, 267 insertions(+), 148 deletions(-)
> 
> diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
> index 0d1e7cb..daae7b6 100644
> --- a/fs/btrfs/backref.c
> +++ b/fs/btrfs/backref.c
> @@ -129,7 +129,7 @@ static int find_extent_in_eb(const struct extent_buffer *eb,
>   * this structure records all encountered refs on the way up to the root
>   */
>  struct prelim_ref {
> -	struct list_head list;
> +	struct rb_node rbnode;
>  	u64 root_id;
>  	struct btrfs_key key_for_search;
>  	int level;
> @@ -139,6 +139,17 @@ struct prelim_ref {
>  	u64 wanted_disk_byte;
>  };
>  
> +struct preftree {
> +	struct rb_root root;
> +};
> +
> +#define PREFTREE_INIT	{ .root = RB_ROOT }
> +
> +struct preftrees {
> +	struct preftree direct;    /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
> +	struct preftree indirect;  /* BTRFS_[TREE_BLOCK|EXTENT_DATA]_REF_KEY */
> +};
> +
>  static struct kmem_cache *btrfs_prelim_ref_cache;
>  
>  int __init btrfs_prelim_ref_init(void)
> @@ -158,6 +169,108 @@ void btrfs_prelim_ref_exit(void)
>  	kmem_cache_destroy(btrfs_prelim_ref_cache);
>  }
>  
> +static void release_pref(struct prelim_ref *ref)
> +{
> +	kmem_cache_free(btrfs_prelim_ref_cache, ref);

This is a bit confusing, 'release' in btrfs code is used to release the
resources but not to actually free the memory. See eg.
btrfs_release_path and btrfs_free_path. As the helper is trivial and I
don't see any other additions to it in the followup patches, I suggest
to drop and opencode it.

> +}
> +
> +/*
> + * Return 0 when both refs are for the same block (and can be merged).
> + * A -1 return indicates ref1 is a 'lower' block than ref2, while 1
> + * indicates a 'higher' block.
> + */
> +static int prelim_ref_compare(struct prelim_ref *ref1,
> +			      struct prelim_ref *ref2)
> +{
> +	if (ref1->level < ref2->level)
> +		return -1;
> +	if (ref1->level > ref2->level)
> +		return 1;
> +	if (ref1->root_id < ref2->root_id)
> +		return -1;
> +	if (ref1->root_id > ref2->root_id)
> +		return 1;
> +	if (ref1->key_for_search.type < ref2->key_for_search.type)
> +		return -1;
> +	if (ref1->key_for_search.type > ref2->key_for_search.type)
> +		return 1;
> +	if (ref1->key_for_search.objectid < ref2->key_for_search.objectid)
> +		return -1;
> +	if (ref1->key_for_search.objectid > ref2->key_for_search.objectid)
> +		return 1;
> +	if (ref1->key_for_search.offset < ref2->key_for_search.offset)
> +		return -1;
> +	if (ref1->key_for_search.offset > ref2->key_for_search.offset)
> +		return 1;
> +	if (ref1->parent < ref2->parent)
> +		return -1;
> +	if (ref1->parent > ref2->parent)
> +		return 1;
> +
> +	return 0;
> +}
> +
> +/*
> + * Add @newref to the @root rbtree, merging identical refs.
> + *
> + * Callers should assume that newref has been freed after calling.
> + */
> +static void prelim_ref_insert(struct preftree *preftree,
> +			      struct prelim_ref *newref)
> +{
> +	struct rb_root *root;
> +	struct rb_node **p;
> +	struct rb_node *parent = NULL;
> +	struct prelim_ref *ref;
> +	int result;
> +
> +	root = &preftree->root;
> +	p = &root->rb_node;
> +
> +	while (*p) {
> +		parent = *p;
> +		ref = rb_entry(parent, struct prelim_ref, rbnode);
> +		result = prelim_ref_compare(ref, newref);
> +		if (result < 0) {
> +			p = &(*p)->rb_left;
> +		} else if (result > 0) {
> +			p = &(*p)->rb_right;
> +		} else {
> +			/* Identical refs, merge them and free @newref */
> +			struct extent_inode_elem *eie = ref->inode_list;
> +
> +			while (eie && eie->next)
> +				eie = eie->next;
> +
> +			if (!eie)
> +				ref->inode_list = newref->inode_list;
> +			else
> +				eie->next = newref->inode_list;
> +			ref->count += newref->count;
> +			release_pref(newref);
> +			return;
> +		}
> +	}
> +
> +	rb_link_node(&newref->rbnode, parent, p);
> +	rb_insert_color(&newref->rbnode, root);
> +}
> +
> +/*
> + * Release the entire tree.  We don't care about internal consistency so
> + * just free everything and then reset the tree root.
> + */
> +static void prelim_release(struct preftree *preftree)
> +{
> +	struct prelim_ref *ref, *next_ref;
> +
> +	rbtree_postorder_for_each_entry_safe(ref, next_ref, &preftree->root,
> +					     rbnode)
> +		release_pref(ref);
> +
> +	preftree->root = RB_ROOT;
> +}
> +
>  /*
>   * the rules for all callers of this function are:
>   * - obtaining the parent is the goal
> @@ -196,7 +309,7 @@ void btrfs_prelim_ref_exit(void)
>   * additional information that's available but not required to find the parent
>   * block might help in merging entries to gain some speed.
>   */
> -static int add_prelim_ref(struct list_head *head, u64 root_id,
> +static int add_prelim_ref(struct preftree *preftree, u64 root_id,
>  			  const struct btrfs_key *key, int level, u64 parent,
>  			  u64 wanted_disk_byte, int count, gfp_t gfp_mask)
>  {
> @@ -243,11 +356,29 @@ static int add_prelim_ref(struct list_head *head, u64 root_id,
>  	ref->count = count;
>  	ref->parent = parent;
>  	ref->wanted_disk_byte = wanted_disk_byte;
> -	list_add_tail(&ref->list, head);
> +	prelim_ref_insert(preftree, ref);
>  
>  	return 0;
>  }
>  
> +/* direct refs use root == 0, key == NULL */
> +static int add_direct_ref(struct preftrees *preftrees, int level, u64 parent,
> +			  u64 wanted_disk_byte, int count, gfp_t gfp_mask)
> +{
> +	return add_prelim_ref(&preftrees->direct, 0, NULL, level, parent,
> +			      wanted_disk_byte, count, gfp_mask);
> +}
> +
> +/* indirect refs use parent == 0 */
> +static int add_indirect_ref(struct preftrees *preftrees, u64 root_id,
> +			    const struct btrfs_key *key, int level,
> +			    u64 wanted_disk_byte, int count, gfp_t gfp_mask)
> +{
> +	return add_prelim_ref(&preftrees->indirect, root_id, key, level, 0,
> +			      wanted_disk_byte, count, gfp_mask);
> +}
> +
> +
>  static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
>  			   struct ulist *parents, struct prelim_ref *ref,
>  			   int level, u64 time_seq, const u64 *extent_item_pos,
> @@ -429,38 +560,58 @@ unode_aux_to_inode_list(struct ulist_node *node)
>  }
>  
>  /*
> - * resolve all indirect backrefs from the list
> + * We maintain two separate rbtrees: one for indirect refs and one for
> + * direct refs. Each tree does merge on insertion.  Once all of the
> + * refs have been located, we iterate over the indirect ref tree, resolve
> + * each reference and remove it from the indirect tree, and then insert
> + * the resolved reference into the direct tree, merging there too.
> + *
> + * New backrefs (i.e., for parent nodes) are added to the direct/indirect
> + * rbtrees as they are encountered.  The new, indirect backrefs are
> + * resolved as above.
>   */
>  static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
>  				 struct btrfs_path *path, u64 time_seq,
> -				 struct list_head *head,
> +				 struct preftrees *preftrees,
>  				 const u64 *extent_item_pos, u64 total_refs,
>  				 u64 root_objectid)
>  {
>  	int err;
>  	int ret = 0;
> -	struct prelim_ref *ref;
> -	struct prelim_ref *ref_safe;
> -	struct prelim_ref *new_ref;
>  	struct ulist *parents;
>  	struct ulist_node *node;
>  	struct ulist_iterator uiter;
> +	struct rb_node *rnode;
>  
>  	parents = ulist_alloc(GFP_NOFS);
>  	if (!parents)
>  		return -ENOMEM;
>  
>  	/*
> -	 * _safe allows us to insert directly after the current item without
> -	 * iterating over the newly inserted items.
> -	 * we're also allowed to re-assign ref during iteration.
> +	 * We could trade memory usage for performance here by iterating
> +	 * the tree, allocating new refs for each insertion, and then
> +	 * freeing the entire indirect tree when we're done.  In some test
> +	 * cases, the tree can grow quite large (~200k objects).
>  	 */
> -	list_for_each_entry_safe(ref, ref_safe, head, list) {
> -		if (ref->parent)	/* already direct */
> -			continue;
> -		if (ref->count == 0)
> +	while ((rnode = rb_first(&preftrees->indirect.root))) {
> +		struct prelim_ref *ref;
> +
> +		ref = rb_entry(rnode, struct prelim_ref, rbnode);
> +		if (WARN(ref->parent,
> +			 "BUG: direct ref found in indirect tree")) {

Is this meant as an assertion or do you expect this could also happen
during normal operation?

> +			ret = -EINVAL;
> +			goto out;
> +		}
> +
> +		rb_erase(&ref->rbnode, &preftrees->indirect.root);
> +
> +		if (ref->count == 0) {
> +			release_pref(ref);
>  			continue;
> +		}
> +
>  		if (root_objectid && ref->root_id != root_objectid) {
> +			release_pref(ref);
>  			ret = BACKREF_FOUND_SHARED;
>  			goto out;
>  		}
> @@ -472,8 +623,11 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
>  		 * and return directly.
>  		 */
>  		if (err == -ENOENT) {
> +			/* This only happens when we're doing sanity testing */
> +			release_pref(ref);
>  			continue;
>  		} else if (err) {
> +			release_pref(ref);
>  			ret = err;
>  			goto out;
>  		}
> @@ -484,19 +638,26 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
>  		ref->parent = node ? node->val : 0;
>  		ref->inode_list = unode_aux_to_inode_list(node);
>  
> -		/* additional parents require new refs being added here */
> +		/* Add a prelim_ref(s) for any other parent(s). */
>  		while ((node = ulist_next(parents, &uiter))) {
> +			struct prelim_ref *new_ref;
> +
>  			new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache,
>  						   GFP_NOFS);
>  			if (!new_ref) {
> +				release_pref(ref);
>  				ret = -ENOMEM;
>  				goto out;
>  			}
>  			memcpy(new_ref, ref, sizeof(*ref));
>  			new_ref->parent = node->val;
>  			new_ref->inode_list = unode_aux_to_inode_list(node);
> -			list_add(&new_ref->list, &ref->list);
> +			prelim_ref_insert(&preftrees->direct, new_ref);
>  		}
> +
> +		/* Now it's a direct ref, put it in the direct tree */
> +		prelim_ref_insert(&preftrees->direct, ref);
> +
>  		ulist_reinit(parents);
>  	}
>  out:
> @@ -504,37 +665,22 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
>  	return ret;
>  }
>  
> -static inline int ref_for_same_block(struct prelim_ref *ref1,
> -				     struct prelim_ref *ref2)
> -{
> -	if (ref1->level != ref2->level)
> -		return 0;
> -	if (ref1->root_id != ref2->root_id)
> -		return 0;
> -	if (ref1->key_for_search.type != ref2->key_for_search.type)
> -		return 0;
> -	if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
> -		return 0;
> -	if (ref1->key_for_search.offset != ref2->key_for_search.offset)
> -		return 0;
> -	if (ref1->parent != ref2->parent)
> -		return 0;
> -
> -	return 1;
> -}
> -
>  /*
>   * read tree blocks and add keys where required.
>   */
>  static int add_missing_keys(struct btrfs_fs_info *fs_info,
> -			    struct list_head *head)
> +			    struct preftrees *preftrees)
>  {
>  	struct prelim_ref *ref;
>  	struct extent_buffer *eb;
> +	struct rb_node *node = rb_first(&preftrees->indirect.root);
> +
> +	while (node) {
> +		ref = rb_entry(node, struct prelim_ref, rbnode);
> +		node = rb_next(&ref->rbnode);
> +		if (WARN(ref->parent, "BUG: direct ref found in indirect tree"))
> +			return -EINVAL;
>  
> -	list_for_each_entry(ref, head, list) {
> -		if (ref->parent)
> -			continue;
>  		if (ref->key_for_search.type)
>  			continue;
>  		BUG_ON(!ref->wanted_disk_byte);
> @@ -557,57 +703,11 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
>  }
>  
>  /*
> - * merge backrefs and adjust counts accordingly
> - *
> - *    FIXME: For MERGE_IDENTICAL_KEYS, if we add more keys in add_prelim_ref
> - *           then we can merge more here. Additionally, we could even add a key
> - *           range for the blocks we looked into to merge even more (-> replace
> - *           unresolved refs by those having a parent).
> - */
> -static void merge_refs(struct list_head *head, enum merge_mode mode)
> -{
> -	struct prelim_ref *pos1;
> -
> -	list_for_each_entry(pos1, head, list) {
> -		struct prelim_ref *pos2 = pos1, *tmp;
> -
> -		list_for_each_entry_safe_continue(pos2, tmp, head, list) {
> -			struct prelim_ref *ref1 = pos1, *ref2 = pos2;
> -			struct extent_inode_elem *eie;
> -
> -			if (!ref_for_same_block(ref1, ref2))
> -				continue;
> -			if (mode == MERGE_IDENTICAL_KEYS) {
> -				if (!ref1->parent && ref2->parent)
> -					swap(ref1, ref2);
> -			} else {
> -				if (ref1->parent != ref2->parent)
> -					continue;
> -			}
> -
> -			eie = ref1->inode_list;
> -			while (eie && eie->next)
> -				eie = eie->next;
> -			if (eie)
> -				eie->next = ref2->inode_list;
> -			else
> -				ref1->inode_list = ref2->inode_list;
> -			ref1->count += ref2->count;
> -
> -			list_del(&ref2->list);
> -			kmem_cache_free(btrfs_prelim_ref_cache, ref2);
> -			cond_resched();
> -		}
> -
> -	}
> -}
> -
> -/*
>   * add all currently queued delayed refs from this head whose seq nr is
>   * smaller or equal that seq to the list
>   */
>  static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
> -			    struct list_head *prefs, u64 *total_refs,
> +			    struct preftrees *preftrees, u64 *total_refs,
>  			    u64 inum)
>  {
>  	struct btrfs_delayed_ref_node *node;
> @@ -642,24 +742,30 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
>  		*total_refs += (node->ref_mod * sgn);
>  		switch (node->type) {
>  		case BTRFS_TREE_BLOCK_REF_KEY: {
> +			/* NORMAL INDIRECT METADATA backref */
>  			struct btrfs_delayed_tree_ref *ref;
>  
>  			ref = btrfs_delayed_node_to_tree_ref(node);
> -			ret = add_prelim_ref(prefs, ref->root, &op_key,
> -					     ref->level + 1, 0, node->bytenr,
> -					     node->ref_mod * sgn, GFP_ATOMIC);
> +			ret = add_indirect_ref(preftrees, ref->root, &op_key,
> +					       ref->level + 1, node->bytenr,
> +					       node->ref_mod * sgn,
> +					       GFP_ATOMIC);
>  			break;
>  		}
>  		case BTRFS_SHARED_BLOCK_REF_KEY: {
> +			/* SHARED DIRECT METADATA backref */
>  			struct btrfs_delayed_tree_ref *ref;
>  
>  			ref = btrfs_delayed_node_to_tree_ref(node);
> -			ret = add_prelim_ref(prefs, 0, NULL, ref->level + 1,
> +
> +			ret = add_direct_ref(preftrees, ref->level + 1,
>  					     ref->parent, node->bytenr,
> -					     node->ref_mod * sgn, GFP_ATOMIC);
> +					     node->ref_mod * sgn,
> +					     GFP_ATOMIC);
>  			break;
>  		}
>  		case BTRFS_EXTENT_DATA_REF_KEY: {
> +			/* NORMAL INDIRECT DATA backref */
>  			struct btrfs_delayed_data_ref *ref;
>  			ref = btrfs_delayed_node_to_data_ref(node);
>  
> @@ -676,17 +782,21 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
>  				break;
>  			}
>  
> -			ret = add_prelim_ref(prefs, ref->root, &key, 0, 0,
> -					     node->bytenr, node->ref_mod * sgn,
> -					     GFP_ATOMIC);
> +			ret = add_indirect_ref(preftrees, ref->root, &key, 0,
> +					       node->bytenr,
> +					       node->ref_mod * sgn,
> +					       GFP_ATOMIC);
>  			break;
>  		}
>  		case BTRFS_SHARED_DATA_REF_KEY: {
> +			/* SHARED DIRECT FULL backref */
>  			struct btrfs_delayed_data_ref *ref;
>  
>  			ref = btrfs_delayed_node_to_data_ref(node);
> -			ret = add_prelim_ref(prefs, 0, NULL, 0, ref->parent,
> -					     node->bytenr, node->ref_mod * sgn,
> +
> +			ret = add_direct_ref(preftrees, 0, ref->parent,
> +					     node->bytenr,
> +					     node->ref_mod * sgn,
>  					     GFP_ATOMIC);
>  			break;
>  		}
> @@ -704,7 +814,7 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
>   * add all inline backrefs for bytenr to the list
>   */
>  static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
> -			   int *info_level, struct list_head *prefs,
> +			   int *info_level, struct preftrees *preftrees,
>  			   u64 *total_refs, u64 inum)
>  {
>  	int ret = 0;
> @@ -760,8 +870,8 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
>  
>  		switch (type) {
>  		case BTRFS_SHARED_BLOCK_REF_KEY:
> -			ret = add_prelim_ref(prefs, 0, NULL, *info_level + 1,
> -					     offset, bytenr, 1, GFP_NOFS);
> +			ret = add_direct_ref(preftrees, *info_level + 1, offset,
> +					     bytenr, 1, GFP_NOFS);
>  			break;
>  		case BTRFS_SHARED_DATA_REF_KEY: {
>  			struct btrfs_shared_data_ref *sdref;
> @@ -769,14 +879,15 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
>  
>  			sdref = (struct btrfs_shared_data_ref *)(iref + 1);
>  			count = btrfs_shared_data_ref_count(leaf, sdref);
> -			ret = add_prelim_ref(prefs, 0, NULL, 0, offset,
> +
> +			ret = add_direct_ref(preftrees, 0, offset,
>  					     bytenr, count, GFP_NOFS);
>  			break;
>  		}
>  		case BTRFS_TREE_BLOCK_REF_KEY:
> -			ret = add_prelim_ref(prefs, offset, NULL,
> -					     *info_level + 1, 0,
> -					     bytenr, 1, GFP_NOFS);
> +			ret = add_indirect_ref(preftrees, offset, NULL,
> +					       *info_level + 1, bytenr, 1,
> +					       GFP_NOFS);
>  			break;
>  		case BTRFS_EXTENT_DATA_REF_KEY: {
>  			struct btrfs_extent_data_ref *dref;
> @@ -796,8 +907,9 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
>  			}
>  
>  			root = btrfs_extent_data_ref_root(leaf, dref);
> -			ret = add_prelim_ref(prefs, root, &key, 0, 0,
> -					     bytenr, count, GFP_NOFS);
> +
> +			ret = add_indirect_ref(preftrees, root, &key, 0, bytenr,
> +					       count, GFP_NOFS);
>  			break;
>  		}
>  		default:
> @@ -816,7 +928,8 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
>   */
>  static int add_keyed_refs(struct btrfs_fs_info *fs_info,
>  			  struct btrfs_path *path, u64 bytenr,
> -			  int info_level, struct list_head *prefs, u64 inum)
> +			  int info_level, struct preftrees *preftrees,
> +			  u64 inum)
>  {
>  	struct btrfs_root *extent_root = fs_info->extent_root;
>  	int ret;
> @@ -846,26 +959,31 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
>  
>  		switch (key.type) {
>  		case BTRFS_SHARED_BLOCK_REF_KEY:
> -			ret = add_prelim_ref(prefs, 0, NULL, info_level + 1,
> -					     key.offset, bytenr, 1, GFP_NOFS);
> +			/* SHARED DIRECT METADATA backref */
> +			ret = add_direct_ref(preftrees, info_level + 1,
> +					     key.offset, bytenr, 1,
> +					     GFP_NOFS);
>  			break;
>  		case BTRFS_SHARED_DATA_REF_KEY: {
> +			/* SHARED DIRECT FULL backref */
>  			struct btrfs_shared_data_ref *sdref;
>  			int count;
>  
>  			sdref = btrfs_item_ptr(leaf, slot,
>  					      struct btrfs_shared_data_ref);
>  			count = btrfs_shared_data_ref_count(leaf, sdref);
> -			ret = add_prelim_ref(prefs, 0, NULL, 0, key.offset,
> -					     bytenr, count, GFP_NOFS);
> +			ret = add_direct_ref(preftrees, 0, key.offset, bytenr,
> +					     count, GFP_NOFS);
>  			break;
>  		}
>  		case BTRFS_TREE_BLOCK_REF_KEY:
> -			ret = add_prelim_ref(prefs, key.offset, NULL,
> -					     info_level + 1, 0,
> -					     bytenr, 1, GFP_NOFS);
> +			/* NORMAL INDIRECT METADATA backref */
> +			ret = add_indirect_ref(preftrees, key.offset, NULL,
> +					       info_level + 1, bytenr, 1,
> +					       GFP_NOFS);
>  			break;
>  		case BTRFS_EXTENT_DATA_REF_KEY: {
> +			/* NORMAL INDIRECT DATA backref */
>  			struct btrfs_extent_data_ref *dref;
>  			int count;
>  			u64 root;
> @@ -884,8 +1002,8 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
>  			}
>  
>  			root = btrfs_extent_data_ref_root(leaf, dref);
> -			ret = add_prelim_ref(prefs, root, &key, 0, 0,
> -					     bytenr, count, GFP_NOFS);
> +			ret = add_indirect_ref(preftrees, root, &key, 0, bytenr,
> +					       count, GFP_NOFS);
>  			break;
>  		}
>  		default:
> @@ -926,14 +1044,15 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
>  	struct btrfs_delayed_ref_head *head;
>  	int info_level = 0;
>  	int ret;
> -	struct list_head prefs_delayed;
> -	struct list_head prefs;
>  	struct prelim_ref *ref;
> +	struct rb_node *node;
>  	struct extent_inode_elem *eie = NULL;
> +	/* total of both direct AND indirect refs! */
>  	u64 total_refs = 0;
> -
> -	INIT_LIST_HEAD(&prefs);
> -	INIT_LIST_HEAD(&prefs_delayed);
> +	struct preftrees preftrees = {
> +		.direct = PREFTREE_INIT,
> +		.indirect = PREFTREE_INIT
> +	};
>  
>  	key.objectid = bytenr;
>  	key.offset = (u64)-1;
> @@ -996,15 +1115,18 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
>  				goto again;
>  			}
>  			spin_unlock(&delayed_refs->lock);
> -			ret = add_delayed_refs(head, time_seq,
> -					       &prefs_delayed, &total_refs,
> -					       inum);
> +			ret = add_delayed_refs(head, time_seq, &preftrees,
> +					       &total_refs, inum);
>  			mutex_unlock(&head->mutex);
>  			if (ret)
>  				goto out;
>  		} else {
>  			spin_unlock(&delayed_refs->lock);
>  		}
> +
> +		/*
> +		 * TODO - implement short circuit for BACKREF_FOUND_SHARED
> +		 */

Please don't add TODOs to code, they tend to stick there indefinitely.
Although this is not the case in your patchset as it gets removed later,
we don't want to encourage such practice.

>  	}
>  
>  	if (path->slots[0]) {
> @@ -1019,35 +1141,41 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
>  		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
>  		     key.type == BTRFS_METADATA_ITEM_KEY)) {
>  			ret = add_inline_refs(path, bytenr, &info_level,
> -					      &prefs, &total_refs, inum);
> +					      &preftrees, &total_refs, inum);
>  			if (ret)
>  				goto out;
>  			ret = add_keyed_refs(fs_info, path, bytenr, info_level,
> -					     &prefs, inum);
> +					     &preftrees, inum);
>  			if (ret)
>  				goto out;
>  		}
>  	}
> -	btrfs_release_path(path);
>  
> -	list_splice_init(&prefs_delayed, &prefs);
> +	btrfs_release_path(path);
>  
> -	ret = add_missing_keys(fs_info, &prefs);
> +	ret = add_missing_keys(fs_info, &preftrees);
>  	if (ret)
>  		goto out;
>  
> -	merge_refs(&prefs, MERGE_IDENTICAL_KEYS);
> -
> -	ret = resolve_indirect_refs(fs_info, path, time_seq, &prefs,
> +	ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
>  				    extent_item_pos, total_refs,
>  				    root_objectid);
>  	if (ret)
>  		goto out;
>  
> -	merge_refs(&prefs, MERGE_IDENTICAL_PARENTS);
> +	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root));
>  
> -	while (!list_empty(&prefs)) {
> -		ref = list_first_entry(&prefs, struct prelim_ref, list);
> +	/*
> +	 * This walks the tree of merged and resolved refs. Tree blocks are
> +	 * read in as needed. Unique entries are added to the ulist, and
> +	 * the list of found roots is updated.
> +	 *
> +	 * We release the entire tree in one go before returning.
> +	 */
> +	node = rb_first(&preftrees.direct.root);
> +	while (node) {
> +		ref = rb_entry(node, struct prelim_ref, rbnode);
> +		node = rb_next(&ref->rbnode);
>  		WARN_ON(ref->count < 0);
>  		if (roots && ref->count && ref->root_id && ref->parent == 0) {
>  			if (root_objectid && ref->root_id != root_objectid) {
> @@ -1101,23 +1229,14 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
>  			}
>  			eie = NULL;
>  		}
> -		list_del(&ref->list);
> -		kmem_cache_free(btrfs_prelim_ref_cache, ref);
>  	}
>  
>  out:
>  	btrfs_free_path(path);
> -	while (!list_empty(&prefs)) {
> -		ref = list_first_entry(&prefs, struct prelim_ref, list);
> -		list_del(&ref->list);
> -		kmem_cache_free(btrfs_prelim_ref_cache, ref);
> -	}
> -	while (!list_empty(&prefs_delayed)) {
> -		ref = list_first_entry(&prefs_delayed, struct prelim_ref,
> -				       list);
> -		list_del(&ref->list);
> -		kmem_cache_free(btrfs_prelim_ref_cache, ref);
> -	}
> +
> +	prelim_release(&preftrees.direct);
> +	prelim_release(&preftrees.indirect);
> +
>  	if (ret < 0)
>  		free_inode_elem_list(eie);
>  	return ret;

Overall, the patch appears big, but I don't see a good way to split it
other than introducing the helpers separately; not much else seems to be
independent.
Jeff Mahoney June 27, 2017, 8:09 p.m. UTC | #3
On 6/26/17 1:07 PM, Jeff Mahoney wrote:
> On 6/20/17 12:06 PM, Edmund Nadolski wrote:
>> It's been known for a while that the use of multiple lists
>> that are periodically merged was an algorithmic problem within
>> btrfs.  There are several workloads that don't complete in any
>> reasonable amount of time (e.g. btrfs/130) and others that cause
>> soft lockups.
>>
>> The solution is to use a pair of rbtrees that do insertion merging
>> for both indirect and direct refs, with the former converting
>> refs into the latter.  The result is a btrfs/130 workload that
>> used to take several hours now takes about half of that. This
>> runtime still isn't acceptable and a future patch will address that
>> by moving the rbtrees higher in the stack so the lookups can be
>> shared across multiple calls to find_parent_nodes.
>>
>> Signed-off-by: Edmund Nadolski <enadolski@suse.com>
>> Signed-off-by: Jeff Mahoney <jeffm@suse.com>
> [...]
> 
>> @@ -504,37 +665,22 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
>>  	return ret;
>>  }
>>  
>> -static inline int ref_for_same_block(struct prelim_ref *ref1,
>> -				     struct prelim_ref *ref2)
>> -{
>> -	if (ref1->level != ref2->level)
>> -		return 0;
>> -	if (ref1->root_id != ref2->root_id)
>> -		return 0;
>> -	if (ref1->key_for_search.type != ref2->key_for_search.type)
>> -		return 0;
>> -	if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
>> -		return 0;
>> -	if (ref1->key_for_search.offset != ref2->key_for_search.offset)
>> -		return 0;
>> -	if (ref1->parent != ref2->parent)
>> -		return 0;
>> -
>> -	return 1;
>> -}
>> -
>>  /*
>>   * read tree blocks and add keys where required.
>>   */
>>  static int add_missing_keys(struct btrfs_fs_info *fs_info,
>> -			    struct list_head *head)
>> +			    struct preftrees *preftrees)
>>  {
>>  	struct prelim_ref *ref;
>>  	struct extent_buffer *eb;
>> +	struct rb_node *node = rb_first(&preftrees->indirect.root);
>> +
>> +	while (node) {
>> +		ref = rb_entry(node, struct prelim_ref, rbnode);
>> +		node = rb_next(&ref->rbnode);
>> +		if (WARN(ref->parent, "BUG: direct ref found in indirect tree"))
>> +			return -EINVAL;
>>  
>> -	list_for_each_entry(ref, head, list) {
>> -		if (ref->parent)
>> -			continue;
>>  		if (ref->key_for_search.type)
>>  			continue;
>>  		BUG_ON(!ref->wanted_disk_byte);
> 
> Hi Ed -
> 
> I missed this in earlier review, but this can't work.  We're modifying
> the ref in a way that the comparator will care about -- so the node
> would move in the tree.
> 
> It's not a fatal flaw and, in fact, leaves us an opening to fix a
> separate locking issue.

Ed and I discussed this offline.  It turns out that this is a code bug
but not a functional bug.  Once we hit add_missing_keys, we don't do any
more inserts or searches.  We only iterate over every node and remove
each as we go, so the tree order doesn't matter.  I'll post a fix shortly.

-Jeff
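
[For reference, the consume-only walk being described is the one in
resolve_indirect_refs() from the patch, roughly (condensed):

	while ((rnode = rb_first(&preftrees->indirect.root))) {
		ref = rb_entry(rnode, struct prelim_ref, rbnode);
		rb_erase(&ref->rbnode, &preftrees->indirect.root);

		/* ... resolve ref->parent from the fs tree ... */

		prelim_ref_insert(&preftrees->direct, ref);
	}

Each node is unlinked before it is modified and nothing searches the
indirect tree afterwards, so a stale sort position is never observed.]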
Jeff Mahoney June 27, 2017, 9:10 p.m. UTC | #4
On 6/27/17 12:49 PM, David Sterba wrote:
> On Tue, Jun 20, 2017 at 10:06:48AM -0600, Edmund Nadolski wrote:
>> It's been known for a while that the use of multiple lists
>> that are periodically merged was an algorithmic problem within
>> btrfs.  There are several workloads that don't complete in any
>> reasonable amount of time (e.g. btrfs/130) and others that cause
>> soft lockups.
>>
>> The solution is to use a pair of rbtrees that do insertion merging
>> for both indirect and direct refs, with the former converting
>> refs into the latter.  The result is a btrfs/130 workload that
>> used to take several hours now takes about half of that. This
>> runtime still isn't acceptable and a future patch will address that
>> by moving the rbtrees higher in the stack so the lookups can be
>> shared across multiple calls to find_parent_nodes.
>>
>> Signed-off-by: Edmund Nadolski <enadolski@suse.com>
>> Signed-off-by: Jeff Mahoney <jeffm@suse.com>
>> ---
>>  fs/btrfs/backref.c | 415 ++++++++++++++++++++++++++++++++++-------------------
>>  1 file changed, 267 insertions(+), 148 deletions(-)
>>
>> diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
>> index 0d1e7cb..daae7b6 100644
>> --- a/fs/btrfs/backref.c
>> +++ b/fs/btrfs/backref.c
>> @@ -129,7 +129,7 @@ static int find_extent_in_eb(const struct extent_buffer *eb,
>>   * this structure records all encountered refs on the way up to the root
>>   */
>>  struct prelim_ref {
>> -	struct list_head list;
>> +	struct rb_node rbnode;
>>  	u64 root_id;
>>  	struct btrfs_key key_for_search;
>>  	int level;
>> @@ -139,6 +139,17 @@ struct prelim_ref {
>>  	u64 wanted_disk_byte;
>>  };
>>  
>> +struct preftree {
>> +	struct rb_root root;
>> +};
>> +
>> +#define PREFTREE_INIT	{ .root = RB_ROOT }
>> +
>> +struct preftrees {
>> +	struct preftree direct;    /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
>> +	struct preftree indirect;  /* BTRFS_[TREE_BLOCK|EXTENT_DATA]_REF_KEY */
>> +};
>> +
>>  static struct kmem_cache *btrfs_prelim_ref_cache;
>>  
>>  int __init btrfs_prelim_ref_init(void)
>> @@ -158,6 +169,108 @@ void btrfs_prelim_ref_exit(void)
>>  	kmem_cache_destroy(btrfs_prelim_ref_cache);
>>  }
>>  
>> +static void release_pref(struct prelim_ref *ref)
>> +{
>> +	kmem_cache_free(btrfs_prelim_ref_cache, ref);
> 
> This is a bit confusing, 'release' in btrfs code is used to release the
> resources but not to actually free the memory. See eg.
> btrfs_release_path and btrfs_free_path. As the helper is trivial and I
> don't see any other additions to it in the followup patches, I suggest
> to drop and opencode it.

I don't mind renaming it to free_pref, but free_pref(ref) is a whole lot
easier to read and write than the alternative.

>> @@ -429,38 +560,58 @@ unode_aux_to_inode_list(struct ulist_node *node)
>>  }
>>  
>>  /*
>> - * resolve all indirect backrefs from the list
>> + * We maintain two separate rbtrees: one for indirect refs and one for
>> + * direct refs. Each tree does merge on insertion.  Once all of the
>> + * refs have been located, we iterate over the indirect ref tree, resolve
>> + * each reference and remove it from the indirect tree, and then insert
>> + * the resolved reference into the direct tree, merging there too.
>> + *
>> + * New backrefs (i.e., for parent nodes) are added to the direct/indirect
>> + * rbtrees as they are encountered.  The new, indirect backrefs are
>> + * resolved as above.
>>   */
>>  static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
>>  				 struct btrfs_path *path, u64 time_seq,
>> -				 struct list_head *head,
>> +				 struct preftrees *preftrees,
>>  				 const u64 *extent_item_pos, u64 total_refs,
>>  				 u64 root_objectid)
>>  {
>>  	int err;
>>  	int ret = 0;
>> -	struct prelim_ref *ref;
>> -	struct prelim_ref *ref_safe;
>> -	struct prelim_ref *new_ref;
>>  	struct ulist *parents;
>>  	struct ulist_node *node;
>>  	struct ulist_iterator uiter;
>> +	struct rb_node *rnode;
>>  
>>  	parents = ulist_alloc(GFP_NOFS);
>>  	if (!parents)
>>  		return -ENOMEM;
>>  
>>  	/*
>> -	 * _safe allows us to insert directly after the current item without
>> -	 * iterating over the newly inserted items.
>> -	 * we're also allowed to re-assign ref during iteration.
>> +	 * We could trade memory usage for performance here by iterating
>> +	 * the tree, allocating new refs for each insertion, and then
>> +	 * freeing the entire indirect tree when we're done.  In some test
>> +	 * cases, the tree can grow quite large (~200k objects).
>>  	 */
>> -	list_for_each_entry_safe(ref, ref_safe, head, list) {
>> -		if (ref->parent)	/* already direct */
>> -			continue;
>> -		if (ref->count == 0)
>> +	while ((rnode = rb_first(&preftrees->indirect.root))) {
>> +		struct prelim_ref *ref;
>> +
>> +		ref = rb_entry(rnode, struct prelim_ref, rbnode);
>> +		if (WARN(ref->parent,
>> +			 "BUG: direct ref found in indirect tree")) {
> 
> Is this meant as an assertion or do you expect this could also happen
> during normal operation?

This one was mine.

It's not meant to be encountered in normal operation, but I've been
trying (and not always succeeding) to take Linus's advice about
gracefully recovering from errors rather than crashing.  Our other two
tools are BUG_ON, which will crash the kernel, and ASSERT, which will
crash the kernel only if the check is enabled and happily break
otherwise.
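
Side by side, the three options for the same check would look roughly
like this (the WARN form is the one the patch uses; the other two are
shown only for contrast):

	BUG_ON(ref->parent);	/* unconditional: crashes the kernel */

	ASSERT(!ref->parent);	/* checked only with CONFIG_BTRFS_ASSERT,
				 * otherwise the bad state is carried on */

	if (WARN(ref->parent, "BUG: direct ref found in indirect tree"))
		return -EINVAL;	/* log a backtrace, fail the lookup,
				 * keep the system running */
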
> Overall, the patch appears big, but I don't see a good way to split it
> other than introducing the helpers separately; not much else seems to be
> independent.

Is that a request to split the helpers out or an observation that doing
so would be the only reason to make it smaller?

-Jeff
David Sterba July 4, 2017, 5:02 p.m. UTC | #5
On Tue, Jun 27, 2017 at 05:10:58PM -0400, Jeff Mahoney wrote:
> On 6/27/17 12:49 PM, David Sterba wrote:
> > On Tue, Jun 20, 2017 at 10:06:48AM -0600, Edmund Nadolski wrote:
> >> It's been known for a while that the use of multiple lists
> >> that are periodically merged was an algorithmic problem within
> >> btrfs.  There are several workloads that don't complete in any
> >> reasonable amount of time (e.g. btrfs/130) and others that cause
> >> soft lockups.
> >>
> >> The solution is to use a pair of rbtrees that do insertion merging
> >> for both indirect and direct refs, with the former converting
> >> refs into the latter.  The result is a btrfs/130 workload that
> >> used to take several hours now takes about half of that. This
> >> runtime still isn't acceptable and a future patch will address that
> >> by moving the rbtrees higher in the stack so the lookups can be
> >> shared across multiple calls to find_parent_nodes.
> >>
> >> Signed-off-by: Edmund Nadolski <enadolski@suse.com>
> >> Signed-off-by: Jeff Mahoney <jeffm@suse.com>
> >> ---
> >>  fs/btrfs/backref.c | 415 ++++++++++++++++++++++++++++++++++-------------------
> >>  1 file changed, 267 insertions(+), 148 deletions(-)
> >>
> >> diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
> >> index 0d1e7cb..daae7b6 100644
> >> --- a/fs/btrfs/backref.c
> >> +++ b/fs/btrfs/backref.c
> >> @@ -129,7 +129,7 @@ static int find_extent_in_eb(const struct extent_buffer *eb,
> >>   * this structure records all encountered refs on the way up to the root
> >>   */
> >>  struct prelim_ref {
> >> -	struct list_head list;
> >> +	struct rb_node rbnode;
> >>  	u64 root_id;
> >>  	struct btrfs_key key_for_search;
> >>  	int level;
> >> @@ -139,6 +139,17 @@ struct prelim_ref {
> >>  	u64 wanted_disk_byte;
> >>  };
> >>  
> >> +struct preftree {
> >> +	struct rb_root root;
> >> +};
> >> +
> >> +#define PREFTREE_INIT	{ .root = RB_ROOT }
> >> +
> >> +struct preftrees {
> >> +	struct preftree direct;    /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
> >> +	struct preftree indirect;  /* BTRFS_[TREE_BLOCK|EXTENT_DATA]_REF_KEY */
> >> +};
> >> +
> >>  static struct kmem_cache *btrfs_prelim_ref_cache;
> >>  
> >>  int __init btrfs_prelim_ref_init(void)
> >> @@ -158,6 +169,108 @@ void btrfs_prelim_ref_exit(void)
> >>  	kmem_cache_destroy(btrfs_prelim_ref_cache);
> >>  }
> >>  
> >> +static void release_pref(struct prelim_ref *ref)
> >> +{
> >> +	kmem_cache_free(btrfs_prelim_ref_cache, ref);
> > 
> > This is a bit confusing, 'release' in btrfs code is used to release the
> > resources but not to actually free the memory. See eg.
> > btrfs_release_path and btrfs_free_path. As the helper is trivial and I
> > don't see any other additions to it in the followup patches, I suggest
> > to drop and opencode it.
> 
> I don't mind renaming it to free_pref, but free_pref(ref) is a whole lot
> easier to read and write than the alternative.
> 
> >> @@ -429,38 +560,58 @@ unode_aux_to_inode_list(struct ulist_node *node)
> >>  }
> >>  
> >>  /*
> >> - * resolve all indirect backrefs from the list
> >> + * We maintain two separate rbtrees: one for indirect refs and one for
> >> + * direct refs. Each tree does merge on insertion.  Once all of the
> >> + * refs have been located, we iterate over the indirect ref tree, resolve
> >> + * each reference and remove it from the indirect tree, and then insert
> >> + * the resolved reference into the direct tree, merging there too.
> >> + *
> >> + * New backrefs (i.e., for parent nodes) are added to the direct/indirect
> >> + * rbtrees as they are encountered.  The new, indirect backrefs are
> >> + * resolved as above.
> >>   */
> >>  static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
> >>  				 struct btrfs_path *path, u64 time_seq,
> >> -				 struct list_head *head,
> >> +				 struct preftrees *preftrees,
> >>  				 const u64 *extent_item_pos, u64 total_refs,
> >>  				 u64 root_objectid)
> >>  {
> >>  	int err;
> >>  	int ret = 0;
> >> -	struct prelim_ref *ref;
> >> -	struct prelim_ref *ref_safe;
> >> -	struct prelim_ref *new_ref;
> >>  	struct ulist *parents;
> >>  	struct ulist_node *node;
> >>  	struct ulist_iterator uiter;
> >> +	struct rb_node *rnode;
> >>  
> >>  	parents = ulist_alloc(GFP_NOFS);
> >>  	if (!parents)
> >>  		return -ENOMEM;
> >>  
> >>  	/*
> >> -	 * _safe allows us to insert directly after the current item without
> >> -	 * iterating over the newly inserted items.
> >> -	 * we're also allowed to re-assign ref during iteration.
> >> +	 * We could trade memory usage for performance here by iterating
> >> +	 * the tree, allocating new refs for each insertion, and then
> >> +	 * freeing the entire indirect tree when we're done.  In some test
> >> +	 * cases, the tree can grow quite large (~200k objects).
> >>  	 */
> >> -	list_for_each_entry_safe(ref, ref_safe, head, list) {
> >> -		if (ref->parent)	/* already direct */
> >> -			continue;
> >> -		if (ref->count == 0)
> >> +	while ((rnode = rb_first(&preftrees->indirect.root))) {
> >> +		struct prelim_ref *ref;
> >> +
> >> +		ref = rb_entry(rnode, struct prelim_ref, rbnode);
> >> +		if (WARN(ref->parent,
> >> +			 "BUG: direct ref found in indirect tree")) {
> > 
> > Is this meant as an assertion or do you expect this could also happen
> > during normal operation?
> 
> This one was mine.
> 
> It's not meant to be encountered in normal operation, but I've been
> trying (and not always succeeding) to take Linus's advice about
> gracefully recovering from errors rather than crashing.  Our other two
> tools are BUG_ON, which will crash the kernel, and ASSERT, which will
> crash the kernel only if the check is enabled and happily break
> otherwise.

Ok, so WARN is fine here.

> > Overall, the patch appears big, but I don't see a good way to split it
> > other than introducing the helpers separately; not much else seems to be
> > independent.
> 
> Is that a request to split the helpers out or an observation that doing
> so would be the only reason to make it smaller?

That was my impression from the review: if there's more code in one
patch, it takes time to see which parts are important, so separating
the helpers out (subjectively) reduces the cognitive load. As there was
no clear way to split the patch, it was not a request but feedback from
the review side.

Patch

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 0d1e7cb..daae7b6 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -129,7 +129,7 @@  static int find_extent_in_eb(const struct extent_buffer *eb,
  * this structure records all encountered refs on the way up to the root
  */
 struct prelim_ref {
-	struct list_head list;
+	struct rb_node rbnode;
 	u64 root_id;
 	struct btrfs_key key_for_search;
 	int level;
@@ -139,6 +139,17 @@  struct prelim_ref {
 	u64 wanted_disk_byte;
 };
 
+struct preftree {
+	struct rb_root root;
+};
+
+#define PREFTREE_INIT	{ .root = RB_ROOT }
+
+struct preftrees {
+	struct preftree direct;    /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
+	struct preftree indirect;  /* BTRFS_[TREE_BLOCK|EXTENT_DATA]_REF_KEY */
+};
+
 static struct kmem_cache *btrfs_prelim_ref_cache;
 
 int __init btrfs_prelim_ref_init(void)
@@ -158,6 +169,108 @@  void btrfs_prelim_ref_exit(void)
 	kmem_cache_destroy(btrfs_prelim_ref_cache);
 }
 
+static void release_pref(struct prelim_ref *ref)
+{
+	kmem_cache_free(btrfs_prelim_ref_cache, ref);
+}
+
+/*
+ * Return 0 when both refs are for the same block (and can be merged).
+ * A -1 return indicates ref1 is a 'lower' block than ref2, while 1
+ * indicates a 'higher' block.
+ */
+static int prelim_ref_compare(struct prelim_ref *ref1,
+			      struct prelim_ref *ref2)
+{
+	if (ref1->level < ref2->level)
+		return -1;
+	if (ref1->level > ref2->level)
+		return 1;
+	if (ref1->root_id < ref2->root_id)
+		return -1;
+	if (ref1->root_id > ref2->root_id)
+		return 1;
+	if (ref1->key_for_search.type < ref2->key_for_search.type)
+		return -1;
+	if (ref1->key_for_search.type > ref2->key_for_search.type)
+		return 1;
+	if (ref1->key_for_search.objectid < ref2->key_for_search.objectid)
+		return -1;
+	if (ref1->key_for_search.objectid > ref2->key_for_search.objectid)
+		return 1;
+	if (ref1->key_for_search.offset < ref2->key_for_search.offset)
+		return -1;
+	if (ref1->key_for_search.offset > ref2->key_for_search.offset)
+		return 1;
+	if (ref1->parent < ref2->parent)
+		return -1;
+	if (ref1->parent > ref2->parent)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Add @newref to the @root rbtree, merging identical refs.
+ *
+ * Callers should assume that newref has been freed after calling.
+ */
+static void prelim_ref_insert(struct preftree *preftree,
+			      struct prelim_ref *newref)
+{
+	struct rb_root *root;
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct prelim_ref *ref;
+	int result;
+
+	root = &preftree->root;
+	p = &root->rb_node;
+
+	while (*p) {
+		parent = *p;
+		ref = rb_entry(parent, struct prelim_ref, rbnode);
+		result = prelim_ref_compare(ref, newref);
+		if (result < 0) {
+			p = &(*p)->rb_left;
+		} else if (result > 0) {
+			p = &(*p)->rb_right;
+		} else {
+			/* Identical refs, merge them and free @newref */
+			struct extent_inode_elem *eie = ref->inode_list;
+
+			while (eie && eie->next)
+				eie = eie->next;
+
+			if (!eie)
+				ref->inode_list = newref->inode_list;
+			else
+				eie->next = newref->inode_list;
+			ref->count += newref->count;
+			release_pref(newref);
+			return;
+		}
+	}
+
+	rb_link_node(&newref->rbnode, parent, p);
+	rb_insert_color(&newref->rbnode, root);
+}
+
+/*
+ * Release the entire tree.  We don't care about internal consistency so
+ * just free everything and then reset the tree root.
+ */
+static void prelim_release(struct preftree *preftree)
+{
+	struct prelim_ref *ref, *next_ref;
+
+	rbtree_postorder_for_each_entry_safe(ref, next_ref, &preftree->root,
+					     rbnode)
+		release_pref(ref);
+
+	preftree->root = RB_ROOT;
+}
+
 /*
  * the rules for all callers of this function are:
  * - obtaining the parent is the goal
@@ -196,7 +309,7 @@  void btrfs_prelim_ref_exit(void)
  * additional information that's available but not required to find the parent
  * block might help in merging entries to gain some speed.
  */
-static int add_prelim_ref(struct list_head *head, u64 root_id,
+static int add_prelim_ref(struct preftree *preftree, u64 root_id,
 			  const struct btrfs_key *key, int level, u64 parent,
 			  u64 wanted_disk_byte, int count, gfp_t gfp_mask)
 {
@@ -243,11 +356,29 @@  static int add_prelim_ref(struct list_head *head, u64 root_id,
 	ref->count = count;
 	ref->parent = parent;
 	ref->wanted_disk_byte = wanted_disk_byte;
-	list_add_tail(&ref->list, head);
+	prelim_ref_insert(preftree, ref);
 
 	return 0;
 }
 
+/* direct refs use root == 0, key == NULL */
+static int add_direct_ref(struct preftrees *preftrees, int level, u64 parent,
+			  u64 wanted_disk_byte, int count, gfp_t gfp_mask)
+{
+	return add_prelim_ref(&preftrees->direct, 0, NULL, level, parent,
+			      wanted_disk_byte, count, gfp_mask);
+}
+
+/* indirect refs use parent == 0 */
+static int add_indirect_ref(struct preftrees *preftrees, u64 root_id,
+			    const struct btrfs_key *key, int level,
+			    u64 wanted_disk_byte, int count, gfp_t gfp_mask)
+{
+	return add_prelim_ref(&preftrees->indirect, root_id, key, level, 0,
+			      wanted_disk_byte, count, gfp_mask);
+}
+
+
 static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 			   struct ulist *parents, struct prelim_ref *ref,
 			   int level, u64 time_seq, const u64 *extent_item_pos,
@@ -429,38 +560,58 @@  unode_aux_to_inode_list(struct ulist_node *node)
 }
 
 /*
- * resolve all indirect backrefs from the list
+ * We maintain two separate rbtrees: one for indirect refs and one for
+ * direct refs. Each tree does merge on insertion.  Once all of the
+ * refs have been located, we iterate over the indirect ref tree, resolve
+ * each reference and remove it from the indirect tree, and then insert
+ * the resolved reference into the direct tree, merging there too.
+ *
+ * New backrefs (i.e., for parent nodes) are added to the direct/indirect
+ * rbtrees as they are encountered.  The new, indirect backrefs are
+ * resolved as above.
  */
 static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 				 struct btrfs_path *path, u64 time_seq,
-				 struct list_head *head,
+				 struct preftrees *preftrees,
 				 const u64 *extent_item_pos, u64 total_refs,
 				 u64 root_objectid)
 {
 	int err;
 	int ret = 0;
-	struct prelim_ref *ref;
-	struct prelim_ref *ref_safe;
-	struct prelim_ref *new_ref;
 	struct ulist *parents;
 	struct ulist_node *node;
 	struct ulist_iterator uiter;
+	struct rb_node *rnode;
 
 	parents = ulist_alloc(GFP_NOFS);
 	if (!parents)
 		return -ENOMEM;
 
 	/*
-	 * _safe allows us to insert directly after the current item without
-	 * iterating over the newly inserted items.
-	 * we're also allowed to re-assign ref during iteration.
+	 * We could trade memory usage for performance here by iterating
+	 * the tree, allocating new refs for each insertion, and then
+	 * freeing the entire indirect tree when we're done.  In some test
+	 * cases, the tree can grow quite large (~200k objects).
 	 */
-	list_for_each_entry_safe(ref, ref_safe, head, list) {
-		if (ref->parent)	/* already direct */
-			continue;
-		if (ref->count == 0)
+	while ((rnode = rb_first(&preftrees->indirect.root))) {
+		struct prelim_ref *ref;
+
+		ref = rb_entry(rnode, struct prelim_ref, rbnode);
+		if (WARN(ref->parent,
+			 "BUG: direct ref found in indirect tree")) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		rb_erase(&ref->rbnode, &preftrees->indirect.root);
+
+		if (ref->count == 0) {
+			release_pref(ref);
 			continue;
+		}
+
 		if (root_objectid && ref->root_id != root_objectid) {
+			release_pref(ref);
 			ret = BACKREF_FOUND_SHARED;
 			goto out;
 		}
@@ -472,8 +623,11 @@  static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		 * and return directly.
 		 */
 		if (err == -ENOENT) {
+			/* This only happens when we're doing sanity testing */
+			release_pref(ref);
 			continue;
 		} else if (err) {
+			release_pref(ref);
 			ret = err;
 			goto out;
 		}
@@ -484,19 +638,26 @@  static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		ref->parent = node ? node->val : 0;
 		ref->inode_list = unode_aux_to_inode_list(node);
 
-		/* additional parents require new refs being added here */
+		/* Add a prelim_ref(s) for any other parent(s). */
 		while ((node = ulist_next(parents, &uiter))) {
+			struct prelim_ref *new_ref;
+
 			new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache,
 						   GFP_NOFS);
 			if (!new_ref) {
+				release_pref(ref);
 				ret = -ENOMEM;
 				goto out;
 			}
 			memcpy(new_ref, ref, sizeof(*ref));
 			new_ref->parent = node->val;
 			new_ref->inode_list = unode_aux_to_inode_list(node);
-			list_add(&new_ref->list, &ref->list);
+			prelim_ref_insert(&preftrees->direct, new_ref);
 		}
+
+		/* Now it's a direct ref, put it in the direct tree */
+		prelim_ref_insert(&preftrees->direct, ref);
+
 		ulist_reinit(parents);
 	}
 out:
@@ -504,37 +665,22 @@  static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
-static inline int ref_for_same_block(struct prelim_ref *ref1,
-				     struct prelim_ref *ref2)
-{
-	if (ref1->level != ref2->level)
-		return 0;
-	if (ref1->root_id != ref2->root_id)
-		return 0;
-	if (ref1->key_for_search.type != ref2->key_for_search.type)
-		return 0;
-	if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
-		return 0;
-	if (ref1->key_for_search.offset != ref2->key_for_search.offset)
-		return 0;
-	if (ref1->parent != ref2->parent)
-		return 0;
-
-	return 1;
-}
-
 /*
  * read tree blocks and add keys where required.
  */
 static int add_missing_keys(struct btrfs_fs_info *fs_info,
-			    struct list_head *head)
+			    struct preftrees *preftrees)
 {
 	struct prelim_ref *ref;
 	struct extent_buffer *eb;
+	struct rb_node *node = rb_first(&preftrees->indirect.root);
+
+	while (node) {
+		ref = rb_entry(node, struct prelim_ref, rbnode);
+		node = rb_next(&ref->rbnode);
+		if (WARN(ref->parent, "BUG: direct ref found in indirect tree"))
+			return -EINVAL;
 
-	list_for_each_entry(ref, head, list) {
-		if (ref->parent)
-			continue;
 		if (ref->key_for_search.type)
 			continue;
 		BUG_ON(!ref->wanted_disk_byte);
@@ -557,57 +703,11 @@  static int add_missing_keys(struct btrfs_fs_info *fs_info,
 }
 
 /*
- * merge backrefs and adjust counts accordingly
- *
- *    FIXME: For MERGE_IDENTICAL_KEYS, if we add more keys in add_prelim_ref
- *           then we can merge more here. Additionally, we could even add a key
- *           range for the blocks we looked into to merge even more (-> replace
- *           unresolved refs by those having a parent).
- */
-static void merge_refs(struct list_head *head, enum merge_mode mode)
-{
-	struct prelim_ref *pos1;
-
-	list_for_each_entry(pos1, head, list) {
-		struct prelim_ref *pos2 = pos1, *tmp;
-
-		list_for_each_entry_safe_continue(pos2, tmp, head, list) {
-			struct prelim_ref *ref1 = pos1, *ref2 = pos2;
-			struct extent_inode_elem *eie;
-
-			if (!ref_for_same_block(ref1, ref2))
-				continue;
-			if (mode == MERGE_IDENTICAL_KEYS) {
-				if (!ref1->parent && ref2->parent)
-					swap(ref1, ref2);
-			} else {
-				if (ref1->parent != ref2->parent)
-					continue;
-			}
-
-			eie = ref1->inode_list;
-			while (eie && eie->next)
-				eie = eie->next;
-			if (eie)
-				eie->next = ref2->inode_list;
-			else
-				ref1->inode_list = ref2->inode_list;
-			ref1->count += ref2->count;
-
-			list_del(&ref2->list);
-			kmem_cache_free(btrfs_prelim_ref_cache, ref2);
-			cond_resched();
-		}
-
-	}
-}
-
-/*
  * add all currently queued delayed refs from this head whose seq nr is
  * smaller or equal that seq to the list
  */
 static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
-			    struct list_head *prefs, u64 *total_refs,
+			    struct preftrees *preftrees, u64 *total_refs,
 			    u64 inum)
 {
 	struct btrfs_delayed_ref_node *node;
@@ -642,24 +742,30 @@  static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 		*total_refs += (node->ref_mod * sgn);
 		switch (node->type) {
 		case BTRFS_TREE_BLOCK_REF_KEY: {
+			/* NORMAL INDIRECT METADATA backref */
 			struct btrfs_delayed_tree_ref *ref;
 
 			ref = btrfs_delayed_node_to_tree_ref(node);
-			ret = add_prelim_ref(prefs, ref->root, &op_key,
-					     ref->level + 1, 0, node->bytenr,
-					     node->ref_mod * sgn, GFP_ATOMIC);
+			ret = add_indirect_ref(preftrees, ref->root, &op_key,
+					       ref->level + 1, node->bytenr,
+					       node->ref_mod * sgn,
+					       GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_SHARED_BLOCK_REF_KEY: {
+			/* SHARED DIRECT METADATA backref */
 			struct btrfs_delayed_tree_ref *ref;
 
 			ref = btrfs_delayed_node_to_tree_ref(node);
-			ret = add_prelim_ref(prefs, 0, NULL, ref->level + 1,
+
+			ret = add_direct_ref(preftrees, ref->level + 1,
 					     ref->parent, node->bytenr,
-					     node->ref_mod * sgn, GFP_ATOMIC);
+					     node->ref_mod * sgn,
+					     GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_EXTENT_DATA_REF_KEY: {
+			/* NORMAL INDIRECT DATA backref */
 			struct btrfs_delayed_data_ref *ref;
 			ref = btrfs_delayed_node_to_data_ref(node);
 
@@ -676,17 +782,21 @@  static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 				break;
 			}
 
-			ret = add_prelim_ref(prefs, ref->root, &key, 0, 0,
-					     node->bytenr, node->ref_mod * sgn,
-					     GFP_ATOMIC);
+			ret = add_indirect_ref(preftrees, ref->root, &key, 0,
+					       node->bytenr,
+					       node->ref_mod * sgn,
+					       GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_SHARED_DATA_REF_KEY: {
+			/* SHARED DIRECT FULL backref */
 			struct btrfs_delayed_data_ref *ref;
 
 			ref = btrfs_delayed_node_to_data_ref(node);
-			ret = add_prelim_ref(prefs, 0, NULL, 0, ref->parent,
-					     node->bytenr, node->ref_mod * sgn,
+
+			ret = add_direct_ref(preftrees, 0, ref->parent,
+					     node->bytenr,
+					     node->ref_mod * sgn,
 					     GFP_ATOMIC);
 			break;
 		}
@@ -704,7 +814,7 @@  static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
  * add all inline backrefs for bytenr to the list
  */
 static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
-			   int *info_level, struct list_head *prefs,
+			   int *info_level, struct preftrees *preftrees,
 			   u64 *total_refs, u64 inum)
 {
 	int ret = 0;
@@ -760,8 +870,8 @@  static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
 
 		switch (type) {
 		case BTRFS_SHARED_BLOCK_REF_KEY:
-			ret = add_prelim_ref(prefs, 0, NULL, *info_level + 1,
-					     offset, bytenr, 1, GFP_NOFS);
+			ret = add_direct_ref(preftrees, *info_level + 1, offset,
+					     bytenr, 1, GFP_NOFS);
 			break;
 		case BTRFS_SHARED_DATA_REF_KEY: {
 			struct btrfs_shared_data_ref *sdref;
@@ -769,14 +879,15 @@  static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
 
 			sdref = (struct btrfs_shared_data_ref *)(iref + 1);
 			count = btrfs_shared_data_ref_count(leaf, sdref);
-			ret = add_prelim_ref(prefs, 0, NULL, 0, offset,
+
+			ret = add_direct_ref(preftrees, 0, offset,
 					     bytenr, count, GFP_NOFS);
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
-			ret = add_prelim_ref(prefs, offset, NULL,
-					     *info_level + 1, 0,
-					     bytenr, 1, GFP_NOFS);
+			ret = add_indirect_ref(preftrees, offset, NULL,
+					       *info_level + 1, bytenr, 1,
+					       GFP_NOFS);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			struct btrfs_extent_data_ref *dref;
@@ -796,8 +907,9 @@  static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
 			}
 
 			root = btrfs_extent_data_ref_root(leaf, dref);
-			ret = add_prelim_ref(prefs, root, &key, 0, 0,
-					     bytenr, count, GFP_NOFS);
+
+			ret = add_indirect_ref(preftrees, root, &key, 0, bytenr,
+					       count, GFP_NOFS);
 			break;
 		}
 		default:
@@ -816,7 +928,8 @@  static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
  */
 static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 			  struct btrfs_path *path, u64 bytenr,
-			  int info_level, struct list_head *prefs, u64 inum)
+			  int info_level, struct preftrees *preftrees,
+			  u64 inum)
 {
 	struct btrfs_root *extent_root = fs_info->extent_root;
 	int ret;
@@ -846,26 +959,31 @@  static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 
 		switch (key.type) {
 		case BTRFS_SHARED_BLOCK_REF_KEY:
-			ret = add_prelim_ref(prefs, 0, NULL, info_level + 1,
-					     key.offset, bytenr, 1, GFP_NOFS);
+			/* SHARED DIRECT METADATA backref */
+			ret = add_direct_ref(preftrees, info_level + 1,
+					     key.offset, bytenr, 1,
+					     GFP_NOFS);
 			break;
 		case BTRFS_SHARED_DATA_REF_KEY: {
+			/* SHARED DIRECT FULL backref */
 			struct btrfs_shared_data_ref *sdref;
 			int count;
 
 			sdref = btrfs_item_ptr(leaf, slot,
 					      struct btrfs_shared_data_ref);
 			count = btrfs_shared_data_ref_count(leaf, sdref);
-			ret = add_prelim_ref(prefs, 0, NULL, 0, key.offset,
-					     bytenr, count, GFP_NOFS);
+			ret = add_direct_ref(preftrees, 0, key.offset, bytenr,
+					     count, GFP_NOFS);
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
-			ret = add_prelim_ref(prefs, key.offset, NULL,
-					     info_level + 1, 0,
-					     bytenr, 1, GFP_NOFS);
+			/* NORMAL INDIRECT METADATA backref */
+			ret = add_indirect_ref(preftrees, key.offset, NULL,
+					       info_level + 1, bytenr, 1,
+					       GFP_NOFS);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY: {
+			/* NORMAL INDIRECT DATA backref */
 			struct btrfs_extent_data_ref *dref;
 			int count;
 			u64 root;
@@ -884,8 +1002,8 @@  static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 			}
 
 			root = btrfs_extent_data_ref_root(leaf, dref);
-			ret = add_prelim_ref(prefs, root, &key, 0, 0,
-					     bytenr, count, GFP_NOFS);
+			ret = add_indirect_ref(preftrees, root, &key, 0, bytenr,
+					       count, GFP_NOFS);
 			break;
 		}
 		default:
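
All of these sites now funnel into the two preftrees, so the "insertion merging"
described in the commit message happens at insert time rather than in a separate
merge_refs() pass. Roughly, an insert-or-merge into an rbtree keyed by a ref
comparator can look like the sketch below; prelim_ref_cmp() is illustrative
only, and the real insert helper also has to splice inode_list chains (which the
removed merge_refs() used to do):

/*
 * Sketch of insertion merging: keep the tree ordered by a ref
 * comparator and, when an equal ref is already present, fold the new
 * ref into it instead of linking a duplicate node.
 */
static void preftree_insert_merge(struct rb_root *root,
				  struct prelim_ref *newref)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;

	while (*p) {
		struct prelim_ref *ref;
		int cmp;

		parent = *p;
		ref = rb_entry(parent, struct prelim_ref, rbnode);
		cmp = prelim_ref_cmp(newref, ref);
		if (cmp < 0) {
			p = &(*p)->rb_left;
		} else if (cmp > 0) {
			p = &(*p)->rb_right;
		} else {
			/* same block/root/key/parent: merge counts */
			ref->count += newref->count;
			kmem_cache_free(btrfs_prelim_ref_cache, newref);
			return;
		}
	}

	rb_link_node(&newref->rbnode, parent, p);
	rb_insert_color(&newref->rbnode, root);
}
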
@@ -926,14 +1044,15 @@  static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_head *head;
 	int info_level = 0;
 	int ret;
-	struct list_head prefs_delayed;
-	struct list_head prefs;
 	struct prelim_ref *ref;
+	struct rb_node *node;
 	struct extent_inode_elem *eie = NULL;
+	/* total of both direct AND indirect refs! */
 	u64 total_refs = 0;
-
-	INIT_LIST_HEAD(&prefs);
-	INIT_LIST_HEAD(&prefs_delayed);
+	struct preftrees preftrees = {
+		.direct = PREFTREE_INIT,
+		.indirect = PREFTREE_INIT
+	};
 
 	key.objectid = bytenr;
 	key.offset = (u64)-1;
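
For orientation, the initializer just above implies the bookkeeping is two
independent trees, one per ref flavour. A sketch of what those definitions might
look like, inferred from the usage here rather than copied from the patch (the
real struct preftree/struct preftrees and PREFTREE_INIT come from an earlier
hunk and may carry extra fields):

/* Inferred shape of the preftree bookkeeping used in find_parent_nodes(). */
struct preftree {
	struct rb_root root;
};

#define PREFTREE_INIT	{ .root = RB_ROOT }

struct preftrees {
	struct preftree direct;    /* refs that already carry a parent bytenr */
	struct preftree indirect;  /* refs keyed by (root, key, level), resolved later */
};
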
@@ -996,15 +1115,18 @@  static int find_parent_nodes(struct btrfs_trans_handle *trans,
 				goto again;
 			}
 			spin_unlock(&delayed_refs->lock);
-			ret = add_delayed_refs(head, time_seq,
-					       &prefs_delayed, &total_refs,
-					       inum);
+			ret = add_delayed_refs(head, time_seq, &preftrees,
+					       &total_refs, inum);
 			mutex_unlock(&head->mutex);
 			if (ret)
 				goto out;
 		} else {
 			spin_unlock(&delayed_refs->lock);
 		}
+
+		/*
+		 * TODO - implement short circuit for BACKREF_FOUND_SHARED
+		 */
 	}
 
 	if (path->slots[0]) {
@@ -1019,35 +1141,41 @@  static int find_parent_nodes(struct btrfs_trans_handle *trans,
 		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
 		     key.type == BTRFS_METADATA_ITEM_KEY)) {
 			ret = add_inline_refs(path, bytenr, &info_level,
-					      &prefs, &total_refs, inum);
+					      &preftrees, &total_refs, inum);
 			if (ret)
 				goto out;
 			ret = add_keyed_refs(fs_info, path, bytenr, info_level,
-					     &prefs, inum);
+					     &preftrees, inum);
 			if (ret)
 				goto out;
 		}
 	}
-	btrfs_release_path(path);
 
-	list_splice_init(&prefs_delayed, &prefs);
+	btrfs_release_path(path);
 
-	ret = add_missing_keys(fs_info, &prefs);
+	ret = add_missing_keys(fs_info, &preftrees);
 	if (ret)
 		goto out;
 
-	merge_refs(&prefs, MERGE_IDENTICAL_KEYS);
-
-	ret = resolve_indirect_refs(fs_info, path, time_seq, &prefs,
+	ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
 				    extent_item_pos, total_refs,
 				    root_objectid);
 	if (ret)
 		goto out;
 
-	merge_refs(&prefs, MERGE_IDENTICAL_PARENTS);
+	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root));
 
-	while (!list_empty(&prefs)) {
-		ref = list_first_entry(&prefs, struct prelim_ref, list);
+	/*
+	 * This walks the tree of merged and resolved refs. Tree blocks are
+	 * read in as needed. Unique entries are added to the ulist, and
+	 * the list of found roots is updated.
+	 *
+	 * We release the entire tree in one go before returning.
+	 */
+	node = rb_first(&preftrees.direct.root);
+	while (node) {
+		ref = rb_entry(node, struct prelim_ref, rbnode);
+		node = rb_next(&ref->rbnode);
 		WARN_ON(ref->count < 0);
 		if (roots && ref->count && ref->root_id && ref->parent == 0) {
 			if (root_objectid && ref->root_id != root_objectid) {
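
The per-iteration kmem_cache_free() that used to live in this loop is gone;
instead both trees are torn down once, at the out: label further down. A
teardown helper of that kind could plausibly use the kernel's postorder rbtree
walker, as in the sketch below; this is an illustration, not the release helper
this patch actually defines in an earlier hunk:

/*
 * Sketch: free every prelim_ref still linked into a tree, in one pass.
 * Postorder traversal frees each node after its children, so no
 * rebalancing is needed during teardown.
 */
static void preftree_teardown(struct rb_root *root)
{
	struct prelim_ref *ref, *next;

	rbtree_postorder_for_each_entry_safe(ref, next, root, rbnode)
		kmem_cache_free(btrfs_prelim_ref_cache, ref);

	*root = RB_ROOT;
}
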
@@ -1101,23 +1229,14 @@  static int find_parent_nodes(struct btrfs_trans_handle *trans,
 			}
 			eie = NULL;
 		}
-		list_del(&ref->list);
-		kmem_cache_free(btrfs_prelim_ref_cache, ref);
 	}
 
 out:
 	btrfs_free_path(path);
-	while (!list_empty(&prefs)) {
-		ref = list_first_entry(&prefs, struct prelim_ref, list);
-		list_del(&ref->list);
-		kmem_cache_free(btrfs_prelim_ref_cache, ref);
-	}
-	while (!list_empty(&prefs_delayed)) {
-		ref = list_first_entry(&prefs_delayed, struct prelim_ref,
-				       list);
-		list_del(&ref->list);
-		kmem_cache_free(btrfs_prelim_ref_cache, ref);
-	}
+
+	prelim_release(&preftrees.direct);
+	prelim_release(&preftrees.indirect);
+
 	if (ret < 0)
 		free_inode_elem_list(eie);
 	return ret;