diff mbox

[v2,08/13] btrfs: convert prelimary reference tracking to use rbtrees

Message ID 20170629035705.1589-9-enadolski@suse.com (mailing list archive)
State New, archived
Headers show

Commit Message

Edmund Nadolski June 29, 2017, 3:57 a.m. UTC
It's been known for a while that the use of multiple lists
that are periodically merged was an algorithmic problem within
btrfs.  There are several workloads that don't complete in any
reasonable amount of time (e.g. btrfs/130) and others that cause
soft lockups.

The solution is to use a pair of rbtrees that do insertion merging
for both indirect and direct refs, with the former converting
refs into the latter.  The result is a btrfs/130 workload that
used to take several hours now takes about half of that. This
runtime still isn't acceptable and a future patch will address that
by moving the rbtrees higher in the stack so the lookups can be
shared across multiple calls to find_parent_nodes.

Signed-off-by: Edmund Nadolski <enadolski@suse.com>
Signed-off-by: Jeff Mahoney <jeffm@suse.com>
---
 fs/btrfs/backref.c | 437 ++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 280 insertions(+), 157 deletions(-)

Comments

David Sterba July 10, 2017, 4:59 p.m. UTC | #1
On Wed, Jun 28, 2017 at 09:57:00PM -0600, Edmund Nadolski wrote:
> It's been known for a while that the use of multiple lists
> that are periodically merged was an algorithmic problem within
> btrfs.  There are several workloads that don't complete in any
> reasonable amount of time (e.g. btrfs/130) and others that cause
> soft lockups.
> 
> The solution is to use a pair of rbtrees that do insertion merging

You've added third rbtree, so the changelog should be updated.

> for both indirect and direct refs, with the former converting
> refs into the latter.  The result is a btrfs/130 workload that
> used to take several hours now takes about half of that. This
> runtime still isn't acceptable and a future patch will address that
> by moving the rbtrees higher in the stack so the lookups can be
> shared across multiple calls to find_parent_nodes.
> 
> Signed-off-by: Edmund Nadolski <enadolski@suse.com>
> Signed-off-by: Jeff Mahoney <jeffm@suse.com>
> ---
>  fs/btrfs/backref.c | 437 ++++++++++++++++++++++++++++++++++-------------------
>  1 file changed, 280 insertions(+), 157 deletions(-)
> 
> diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
> index 6cac5ab..ebe8875 100644
> --- a/fs/btrfs/backref.c
> +++ b/fs/btrfs/backref.c
> @@ -26,11 +26,6 @@
>  #include "delayed-ref.h"
>  #include "locking.h"
>  
> -enum merge_mode {
> -	MERGE_IDENTICAL_KEYS = 1,
> -	MERGE_IDENTICAL_PARENTS,
> -};
> -
>  /* Just an arbitrary number so we can be sure this happened */
>  #define BACKREF_FOUND_SHARED 6
>  
> @@ -129,7 +124,7 @@ static int find_extent_in_eb(const struct extent_buffer *eb,
>   * this structure records all encountered refs on the way up to the root
>   */
>  struct prelim_ref {
> -	struct list_head list;
> +	struct rb_node rbnode;
>  	u64 root_id;
>  	struct btrfs_key key_for_search;
>  	int level;
> @@ -139,6 +134,18 @@ struct prelim_ref {
>  	u64 wanted_disk_byte;
>  };
>  
> +struct preftree {
> +	struct rb_root root;
> +};
> +
> +#define PREFTREE_INIT	{ .root = RB_ROOT }
> +
> +struct preftrees {
> +	struct preftree direct;    /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
> +	struct preftree indirect;  /* BTRFS_[TREE_BLOCK|EXTENT_DATA]_REF_KEY */
> +	struct preftree indirect_missing_keys;
> +};
> +
>  static struct kmem_cache *btrfs_prelim_ref_cache;
>  
>  int __init btrfs_prelim_ref_init(void)
> @@ -158,6 +165,108 @@ void btrfs_prelim_ref_exit(void)
>  	kmem_cache_destroy(btrfs_prelim_ref_cache);
>  }
>  
> +static void free_pref(struct prelim_ref *ref)
> +{
> +	kmem_cache_free(btrfs_prelim_ref_cache, ref);
> +}
> +
> +/*
> + * Return 0 when both refs are for the same block (and can be merged).
> + * A -1 return indicates ref1 is a 'lower' block than ref2, while 1
> + * indicates a 'higher' block.
> + */
> +static int prelim_ref_compare(struct prelim_ref *ref1,
> +			      struct prelim_ref *ref2)
> +{
> +	if (ref1->level < ref2->level)
> +		return -1;
> +	if (ref1->level > ref2->level)
> +		return 1;
> +	if (ref1->root_id < ref2->root_id)
> +		return -1;
> +	if (ref1->root_id > ref2->root_id)
> +		return 1;
> +	if (ref1->key_for_search.type < ref2->key_for_search.type)
> +		return -1;
> +	if (ref1->key_for_search.type > ref2->key_for_search.type)
> +		return 1;
> +	if (ref1->key_for_search.objectid < ref2->key_for_search.objectid)
> +		return -1;
> +	if (ref1->key_for_search.objectid > ref2->key_for_search.objectid)
> +		return 1;
> +	if (ref1->key_for_search.offset < ref2->key_for_search.offset)
> +		return -1;
> +	if (ref1->key_for_search.offset > ref2->key_for_search.offset)
> +		return 1;
> +	if (ref1->parent < ref2->parent)
> +		return -1;
> +	if (ref1->parent > ref2->parent)
> +		return 1;
> +
> +	return 0;
> +}
> +
> +/*
> + * Add @newref to the @root rbtree, merging identical refs.
> + *
> + * Callers should assumed that newref has been freed after calling.
> + */
> +static void prelim_ref_insert(struct preftree *preftree,
> +			      struct prelim_ref *newref)
> +{
> +	struct rb_root *root;
> +	struct rb_node **p;
> +	struct rb_node *parent = NULL;
> +	struct prelim_ref *ref;
> +	int result;
> +
> +	root = &preftree->root;
> +	p = &root->rb_node;
> +
> +	while (*p) {
> +		parent = *p;
> +		ref = rb_entry(parent, struct prelim_ref, rbnode);
> +		result = prelim_ref_compare(ref, newref);
> +		if (result < 0) {
> +			p = &(*p)->rb_left;
> +		} else if (result > 0) {
> +			p = &(*p)->rb_right;
> +		} else {
> +			/* Identical refs, merge them and free @newref */
> +			struct extent_inode_elem *eie = ref->inode_list;
> +
> +			while (eie && eie->next)
> +				eie = eie->next;
> +
> +			if (!eie)
> +				ref->inode_list = newref->inode_list;
> +			else
> +				eie->next = newref->inode_list;
> +			ref->count += newref->count;
> +			free_pref(newref);
> +			return;
> +		}
> +	}
> +
> +	rb_link_node(&newref->rbnode, parent, p);
> +	rb_insert_color(&newref->rbnode, root);
> +}
> +
> +/*
> + * Release the entire tree.  We don't care about internal consistency so
> + * just free everything and then reset the tree root.
> + */
> +static void prelim_release(struct preftree *preftree)
> +{
> +	struct prelim_ref *ref, *next_ref;
> +
> +	rbtree_postorder_for_each_entry_safe(ref, next_ref, &preftree->root,
> +					     rbnode)
> +		free_pref(ref);
> +
> +	preftree->root = RB_ROOT;
> +}
> +
>  /*
>   * the rules for all callers of this function are:
>   * - obtaining the parent is the goal
> @@ -196,7 +305,7 @@ void btrfs_prelim_ref_exit(void)
>   * additional information that's available but not required to find the parent
>   * block might help in merging entries to gain some speed.
>   */
> -static int add_prelim_ref(struct list_head *head, u64 root_id,
> +static int add_prelim_ref(struct preftree *preftree, u64 root_id,
>  			  const struct btrfs_key *key, int level, u64 parent,
>  			  u64 wanted_disk_byte, int count, gfp_t gfp_mask)
>  {
> @@ -243,11 +352,31 @@ static int add_prelim_ref(struct list_head *head, u64 root_id,
>  	ref->count = count;
>  	ref->parent = parent;
>  	ref->wanted_disk_byte = wanted_disk_byte;
> -	list_add_tail(&ref->list, head);
> +	prelim_ref_insert(preftree, ref);
>  
>  	return 0;
>  }
>  
> +/* direct refs use root == 0, key == NULL */
> +static int add_direct_ref(struct preftrees *preftrees, int level, u64 parent,
> +			  u64 wanted_disk_byte, int count, gfp_t gfp_mask)
> +{
> +	return add_prelim_ref(&preftrees->direct, 0, NULL, level, parent,
> +			      wanted_disk_byte, count, gfp_mask);
> +}
> +
> +/* indirect refs use parent == 0 */
> +static int add_indirect_ref(struct preftrees *preftrees, u64 root_id,
> +			    const struct btrfs_key *key, int level,
> +			    u64 wanted_disk_byte, int count, gfp_t gfp_mask)
> +{
> +	struct preftree *tree = &preftrees->indirect;
> +	if (!key)
> +		tree = &preftrees->indirect_missing_keys;
> +	return add_prelim_ref(tree, root_id, key, level, 0,
> +			      wanted_disk_byte, count, gfp_mask);
> +}
> +
>  static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
>  			   struct ulist *parents, struct prelim_ref *ref,
>  			   int level, u64 time_seq, const u64 *extent_item_pos,
> @@ -429,38 +558,58 @@ unode_aux_to_inode_list(struct ulist_node *node)
>  }
>  
>  /*
> - * resolve all indirect backrefs from the list
> + * We maintain two separate rbtrees: one for indirect refs and one for

As it's 'three' now, this needs to be updated as well.

> + * direct refs. Each tree does merge on insertion.  Once all of the
> + * refs have been located, we iterate over the indirect ref tree, resolve
> + * each reference and remove it from the indirect tree, and then insert
> + * the resolved reference into the direct tree, merging there too.
> + *
> + * New backrefs (i.e., for parent nodes) are added to the direct/indirect
> + * rbtrees as they are encountered.  The new, indirect backrefs are
> + * resolved as above.
>   */
>  static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
>  				 struct btrfs_path *path, u64 time_seq,
> -				 struct list_head *head,
> +				 struct preftrees *preftrees,
>  				 const u64 *extent_item_pos, u64 total_refs,
>  				 u64 root_objectid)
>  {
>  	int err;
>  	int ret = 0;
> -	struct prelim_ref *ref;
> -	struct prelim_ref *ref_safe;
> -	struct prelim_ref *new_ref;
>  	struct ulist *parents;
>  	struct ulist_node *node;
>  	struct ulist_iterator uiter;
> +	struct rb_node *rnode;
>  
>  	parents = ulist_alloc(GFP_NOFS);
>  	if (!parents)
>  		return -ENOMEM;
>  
>  	/*
> -	 * _safe allows us to insert directly after the current item without
> -	 * iterating over the newly inserted items.
> -	 * we're also allowed to re-assign ref during iteration.
> +	 * We could trade memory usage for performance here by iterating
> +	 * the tree, allocating new refs for each insertion, and then
> +	 * freeing the entire indirect tree when we're done.  In some test
> +	 * cases, the tree can grow quite large (~200k objects).
>  	 */
> -	list_for_each_entry_safe(ref, ref_safe, head, list) {
> -		if (ref->parent)	/* already direct */
> -			continue;
> -		if (ref->count == 0)
> +	while ((rnode = rb_first(&preftrees->indirect.root))) {
> +		struct prelim_ref *ref;
> +
> +		ref = rb_entry(rnode, struct prelim_ref, rbnode);
> +		if (WARN(ref->parent,
> +			 "BUG: direct ref found in indirect tree")) {
> +			ret = -EINVAL;
> +			goto out;
> +		}
> +
> +		rb_erase(&ref->rbnode, &preftrees->indirect.root);
> +
> +		if (ref->count == 0) {
> +			free_pref(ref);
>  			continue;
> +		}
> +
>  		if (root_objectid && ref->root_id != root_objectid) {
> +			free_pref(ref);
>  			ret = BACKREF_FOUND_SHARED;
>  			goto out;
>  		}
> @@ -472,8 +621,11 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
>  		 * and return directly.
>  		 */
>  		if (err == -ENOENT) {
> +			/* This only happens when we're doing sanity testing */
> +			free_pref(ref);
>  			continue;
>  		} else if (err) {
> +			free_pref(ref);
>  			ret = err;
>  			goto out;
>  		}
> @@ -484,19 +636,26 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
>  		ref->parent = node ? node->val : 0;
>  		ref->inode_list = unode_aux_to_inode_list(node);
>  
> -		/* additional parents require new refs being added here */
> +		/* Add a prelim_ref(s) for any other parent(s). */
>  		while ((node = ulist_next(parents, &uiter))) {
> +			struct prelim_ref *new_ref;
> +
>  			new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache,
>  						   GFP_NOFS);
>  			if (!new_ref) {
> +				free_pref(ref);
>  				ret = -ENOMEM;
>  				goto out;
>  			}
>  			memcpy(new_ref, ref, sizeof(*ref));
>  			new_ref->parent = node->val;
>  			new_ref->inode_list = unode_aux_to_inode_list(node);
> -			list_add(&new_ref->list, &ref->list);
> +			prelim_ref_insert(&preftrees->direct, new_ref);
>  		}
> +
> +		/* Now it's a direct ref, put it in the the direct tree */
> +		prelim_ref_insert(&preftrees->direct, ref);
> +
>  		ulist_reinit(parents);
>  	}
>  out:
> @@ -504,44 +663,31 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
>  	return ret;
>  }
>  
> -static inline int ref_for_same_block(struct prelim_ref *ref1,
> -				     struct prelim_ref *ref2)
> -{
> -	if (ref1->level != ref2->level)
> -		return 0;
> -	if (ref1->root_id != ref2->root_id)
> -		return 0;
> -	if (ref1->key_for_search.type != ref2->key_for_search.type)
> -		return 0;
> -	if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
> -		return 0;
> -	if (ref1->key_for_search.offset != ref2->key_for_search.offset)
> -		return 0;
> -	if (ref1->parent != ref2->parent)
> -		return 0;
> -
> -	return 1;
> -}
> -
>  /*
>   * read tree blocks and add keys where required.
>   */
>  static int add_missing_keys(struct btrfs_fs_info *fs_info,
> -			    struct list_head *head)
> +			    struct preftrees *preftrees)
>  {
>  	struct prelim_ref *ref;
>  	struct extent_buffer *eb;
> +	struct preftree *tree = &preftrees->indirect_missing_keys;
> +	struct rb_node *node;
>  
> -	list_for_each_entry(ref, head, list) {
> -		if (ref->parent)
> -			continue;
> -		if (ref->key_for_search.type)
> -			continue;
> +	while ((node = rb_first(&tree->root))) {
> +		ref = rb_entry(node, struct prelim_ref, rbnode);
> +		rb_erase(node, &tree->root);
> +
> +		BUG_ON(ref->parent);	/* should not be a direct ref */
> +		BUG_ON(ref->key_for_search.type);
>  		BUG_ON(!ref->wanted_disk_byte);
> +
>  		eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0);
>  		if (IS_ERR(eb)) {
> +			free_pref(ref);
>  			return PTR_ERR(eb);
>  		} else if (!extent_buffer_uptodate(eb)) {
> +			free_pref(ref);
>  			free_extent_buffer(eb);
>  			return -EIO;
>  		}
> @@ -552,73 +698,31 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
>  			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
>  		btrfs_tree_read_unlock(eb);
>  		free_extent_buffer(eb);
> +		prelim_ref_insert(&preftrees->indirect, ref);
>  	}
>  	return 0;
>  }
>  
>  /*
> - * merge backrefs and adjust counts accordingly
> - *
> - *    FIXME: For MERGE_IDENTICAL_KEYS, if we add more keys in add_prelim_ref
> - *           then we can merge more here. Additionally, we could even add a key
> - *           range for the blocks we looked into to merge even more (-> replace
> - *           unresolved refs by those having a parent).
> - */
> -static void merge_refs(struct list_head *head, enum merge_mode mode)
> -{
> -	struct prelim_ref *pos1;
> -
> -	list_for_each_entry(pos1, head, list) {
> -		struct prelim_ref *pos2 = pos1, *tmp;
> -
> -		list_for_each_entry_safe_continue(pos2, tmp, head, list) {
> -			struct prelim_ref *ref1 = pos1, *ref2 = pos2;
> -			struct extent_inode_elem *eie;
> -
> -			if (!ref_for_same_block(ref1, ref2))
> -				continue;
> -			if (mode == MERGE_IDENTICAL_KEYS) {
> -				if (!ref1->parent && ref2->parent)
> -					swap(ref1, ref2);
> -			} else {
> -				if (ref1->parent != ref2->parent)
> -					continue;
> -			}
> -
> -			eie = ref1->inode_list;
> -			while (eie && eie->next)
> -				eie = eie->next;
> -			if (eie)
> -				eie->next = ref2->inode_list;
> -			else
> -				ref1->inode_list = ref2->inode_list;
> -			ref1->count += ref2->count;
> -
> -			list_del(&ref2->list);
> -			kmem_cache_free(btrfs_prelim_ref_cache, ref2);
> -			cond_resched();
> -		}
> -
> -	}
> -}
> -
> -/*
>   * add all currently queued delayed refs from this head whose seq nr is
>   * smaller or equal that seq to the list
>   */
>  static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
> -			    struct list_head *prefs, u64 *total_refs,
> +			    struct preftrees *preftrees, u64 *total_refs,
>  			    u64 inum)
>  {
>  	struct btrfs_delayed_ref_node *node;
>  	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
>  	struct btrfs_key key;
> -	struct btrfs_key op_key = {0};
> +	struct btrfs_key tmp_op_key;
> +	struct btrfs_key *op_key = NULL;
>  	int sgn;
>  	int ret = 0;
>  
> -	if (extent_op && extent_op->update_key)
> -		btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
> +	if (extent_op && extent_op->update_key) {
> +		btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
> +		op_key = &tmp_op_key;
> +	}
>  
>  	spin_lock(&head->lock);
>  	list_for_each_entry(node, &head->ref_list, list) {
> @@ -642,24 +746,30 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
>  		*total_refs += (node->ref_mod * sgn);
>  		switch (node->type) {
>  		case BTRFS_TREE_BLOCK_REF_KEY: {
> +			/* NORMAL INDIRECT METADATA backref */
>  			struct btrfs_delayed_tree_ref *ref;
>  
>  			ref = btrfs_delayed_node_to_tree_ref(node);
> -			ret = add_prelim_ref(prefs, ref->root, &op_key,
> -					     ref->level + 1, 0, node->bytenr,
> -					     node->ref_mod * sgn, GFP_ATOMIC);
> +			ret = add_indirect_ref(preftrees, ref->root, &tmp_op_key,
> +					       ref->level + 1, node->bytenr,
> +					       node->ref_mod * sgn,
> +					       GFP_ATOMIC);
>  			break;
>  		}
>  		case BTRFS_SHARED_BLOCK_REF_KEY: {
> +			/* SHARED DIRECT METADATA backref */
>  			struct btrfs_delayed_tree_ref *ref;
>  
>  			ref = btrfs_delayed_node_to_tree_ref(node);
> -			ret = add_prelim_ref(prefs, 0, NULL, ref->level + 1,
> +
> +			ret = add_direct_ref(preftrees, ref->level + 1,
>  					     ref->parent, node->bytenr,
> -					     node->ref_mod * sgn, GFP_ATOMIC);
> +					     node->ref_mod * sgn,
> +					     GFP_ATOMIC);
>  			break;
>  		}
>  		case BTRFS_EXTENT_DATA_REF_KEY: {
> +			/* NORMAL INDIRECT DATA backref */
>  			struct btrfs_delayed_data_ref *ref;
>  			ref = btrfs_delayed_node_to_data_ref(node);
>  
> @@ -676,17 +786,21 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
>  				break;
>  			}
>  
> -			ret = add_prelim_ref(prefs, ref->root, &key, 0, 0,
> -					     node->bytenr, node->ref_mod * sgn,
> -					     GFP_ATOMIC);
> +			ret = add_indirect_ref(preftrees, ref->root, &key, 0,
> +					       node->bytenr,
> +					       node->ref_mod * sgn,
> +					       GFP_ATOMIC);
>  			break;
>  		}
>  		case BTRFS_SHARED_DATA_REF_KEY: {
> +			/* SHARED DIRECT FULL backref */
>  			struct btrfs_delayed_data_ref *ref;
>  
>  			ref = btrfs_delayed_node_to_data_ref(node);
> -			ret = add_prelim_ref(prefs, 0, NULL, 0, ref->parent,
> -					     node->bytenr, node->ref_mod * sgn,
> +
> +			ret = add_direct_ref(preftrees, 0, ref->parent,
> +					     node->bytenr,
> +					     node->ref_mod * sgn,
>  					     GFP_ATOMIC);
>  			break;
>  		}
> @@ -704,7 +818,7 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
>   * add all inline backrefs for bytenr to the list
>   */
>  static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
> -			   int *info_level, struct list_head *prefs,
> +			   int *info_level, struct preftrees *preftrees,
>  			   u64 *total_refs, u64 inum)
>  {
>  	int ret = 0;
> @@ -760,8 +874,8 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
>  
>  		switch (type) {
>  		case BTRFS_SHARED_BLOCK_REF_KEY:
> -			ret = add_prelim_ref(prefs, 0, NULL, *info_level + 1,
> -					     offset, bytenr, 1, GFP_NOFS);
> +			ret = add_direct_ref(preftrees, *info_level + 1, offset,
> +					     bytenr, 1, GFP_NOFS);
>  			break;
>  		case BTRFS_SHARED_DATA_REF_KEY: {
>  			struct btrfs_shared_data_ref *sdref;
> @@ -769,14 +883,15 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
>  
>  			sdref = (struct btrfs_shared_data_ref *)(iref + 1);
>  			count = btrfs_shared_data_ref_count(leaf, sdref);
> -			ret = add_prelim_ref(prefs, 0, NULL, 0, offset,
> +
> +			ret = add_direct_ref(preftrees, 0, offset,
>  					     bytenr, count, GFP_NOFS);
>  			break;
>  		}
>  		case BTRFS_TREE_BLOCK_REF_KEY:
> -			ret = add_prelim_ref(prefs, offset, NULL,
> -					     *info_level + 1, 0,
> -					     bytenr, 1, GFP_NOFS);
> +			ret = add_indirect_ref(preftrees, offset, NULL,
> +					       *info_level + 1, bytenr, 1,
> +					       GFP_NOFS);
>  			break;
>  		case BTRFS_EXTENT_DATA_REF_KEY: {
>  			struct btrfs_extent_data_ref *dref;
> @@ -796,8 +911,9 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
>  			}
>  
>  			root = btrfs_extent_data_ref_root(leaf, dref);
> -			ret = add_prelim_ref(prefs, root, &key, 0, 0,
> -					     bytenr, count, GFP_NOFS);
> +
> +			ret = add_indirect_ref(preftrees, root, &key, 0, bytenr,
> +					       count, GFP_NOFS);
>  			break;
>  		}
>  		default:
> @@ -816,7 +932,8 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
>   */
>  static int add_keyed_refs(struct btrfs_fs_info *fs_info,
>  			  struct btrfs_path *path, u64 bytenr,
> -			  int info_level, struct list_head *prefs, u64 inum)
> +			  int info_level, struct preftrees *preftrees,
> +			  u64 inum)
>  {
>  	struct btrfs_root *extent_root = fs_info->extent_root;
>  	int ret;
> @@ -846,26 +963,31 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
>  
>  		switch (key.type) {
>  		case BTRFS_SHARED_BLOCK_REF_KEY:
> -			ret = add_prelim_ref(prefs, 0, NULL, info_level + 1,
> -					     key.offset, bytenr, 1, GFP_NOFS);
> +			/* SHARED DIRECT METADATA backref */
> +			ret = add_direct_ref(preftrees, info_level + 1,
> +					     key.offset, bytenr, 1,
> +					     GFP_NOFS);
>  			break;
>  		case BTRFS_SHARED_DATA_REF_KEY: {
> +			/* SHARED DIRECT FULL backref */
>  			struct btrfs_shared_data_ref *sdref;
>  			int count;
>  
>  			sdref = btrfs_item_ptr(leaf, slot,
>  					      struct btrfs_shared_data_ref);
>  			count = btrfs_shared_data_ref_count(leaf, sdref);
> -			ret = add_prelim_ref(prefs, 0, NULL, 0, key.offset,
> -					     bytenr, count, GFP_NOFS);
> +			ret = add_direct_ref(preftrees, 0, key.offset, bytenr,
> +					     count, GFP_NOFS);
>  			break;
>  		}
>  		case BTRFS_TREE_BLOCK_REF_KEY:
> -			ret = add_prelim_ref(prefs, key.offset, NULL,
> -					     info_level + 1, 0,
> -					     bytenr, 1, GFP_NOFS);
> +			/* NORMAL INDIRECT METADATA backref */
> +			ret = add_indirect_ref(preftrees, key.offset, NULL,
> +					       info_level + 1, bytenr, 1,
> +					       GFP_NOFS);
>  			break;
>  		case BTRFS_EXTENT_DATA_REF_KEY: {
> +			/* NORMAL INDIRECT DATA backref */
>  			struct btrfs_extent_data_ref *dref;
>  			int count;
>  			u64 root;
> @@ -884,8 +1006,8 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
>  			}
>  
>  			root = btrfs_extent_data_ref_root(leaf, dref);
> -			ret = add_prelim_ref(prefs, root, &key, 0, 0,
> -					     bytenr, count, GFP_NOFS);
> +			ret = add_indirect_ref(preftrees, root, &key, 0, bytenr,
> +					       count, GFP_NOFS);
>  			break;
>  		}
>  		default:
> @@ -926,14 +1048,16 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
>  	struct btrfs_delayed_ref_head *head;
>  	int info_level = 0;
>  	int ret;
> -	struct list_head prefs_delayed;
> -	struct list_head prefs;
>  	struct prelim_ref *ref;
> +	struct rb_node *node;
>  	struct extent_inode_elem *eie = NULL;
> +	/* total of both direct AND indirect refs! */
>  	u64 total_refs = 0;
> -
> -	INIT_LIST_HEAD(&prefs);
> -	INIT_LIST_HEAD(&prefs_delayed);
> +	struct preftrees preftrees = {
> +		.direct = PREFTREE_INIT,
> +		.indirect = PREFTREE_INIT,
> +		.indirect_missing_keys = PREFTREE_INIT
> +	};
>  
>  	key.objectid = bytenr;
>  	key.offset = (u64)-1;
> @@ -996,9 +1120,8 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
>  				goto again;
>  			}
>  			spin_unlock(&delayed_refs->lock);
> -			ret = add_delayed_refs(head, time_seq,
> -					       &prefs_delayed, &total_refs,
> -					       inum);
> +			ret = add_delayed_refs(head, time_seq, &preftrees,
> +					       &total_refs, inum);
>  			mutex_unlock(&head->mutex);
>  			if (ret)
>  				goto out;
> @@ -1019,35 +1142,43 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
>  		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
>  		     key.type == BTRFS_METADATA_ITEM_KEY)) {
>  			ret = add_inline_refs(path, bytenr, &info_level,
> -					      &prefs, &total_refs, inum);
> +					      &preftrees, &total_refs, inum);
>  			if (ret)
>  				goto out;
>  			ret = add_keyed_refs(fs_info, path, bytenr, info_level,
> -					     &prefs, inum);
> +					     &preftrees, inum);
>  			if (ret)
>  				goto out;
>  		}
>  	}
> -	btrfs_release_path(path);
>  
> -	list_splice_init(&prefs_delayed, &prefs);
> +	btrfs_release_path(path);
>  
> -	ret = add_missing_keys(fs_info, &prefs);
> +	ret = add_missing_keys(fs_info, &preftrees);
>  	if (ret)
>  		goto out;
>  
> -	merge_refs(&prefs, MERGE_IDENTICAL_KEYS);
> +	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root));
>  
> -	ret = resolve_indirect_refs(fs_info, path, time_seq, &prefs,
> +	ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
>  				    extent_item_pos, total_refs,
>  				    root_objectid);
>  	if (ret)
>  		goto out;
>  
> -	merge_refs(&prefs, MERGE_IDENTICAL_PARENTS);
> +	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root));
>  
> -	while (!list_empty(&prefs)) {
> -		ref = list_first_entry(&prefs, struct prelim_ref, list);
> +	/*
> +	 * This walks the tree of merged and resolved refs. Tree blocks are
> +	 * read in as needed. Unique entries are added to the ulist, and
> +	 * the list of found roots is updated.
> +	 *
> +	 * We release the entire tree in one go before returning.
> +	 */
> +	node = rb_first(&preftrees.direct.root);
> +	while (node) {
> +		ref = rb_entry(node, struct prelim_ref, rbnode);
> +		node = rb_next(&ref->rbnode);
>  		WARN_ON(ref->count < 0);
>  		if (roots && ref->count && ref->root_id && ref->parent == 0) {
>  			if (root_objectid && ref->root_id != root_objectid) {
> @@ -1101,23 +1232,15 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
>  			}
>  			eie = NULL;
>  		}
> -		list_del(&ref->list);
> -		kmem_cache_free(btrfs_prelim_ref_cache, ref);
>  	}
>  
>  out:
>  	btrfs_free_path(path);
> -	while (!list_empty(&prefs)) {
> -		ref = list_first_entry(&prefs, struct prelim_ref, list);
> -		list_del(&ref->list);
> -		kmem_cache_free(btrfs_prelim_ref_cache, ref);
> -	}
> -	while (!list_empty(&prefs_delayed)) {
> -		ref = list_first_entry(&prefs_delayed, struct prelim_ref,
> -				       list);
> -		list_del(&ref->list);
> -		kmem_cache_free(btrfs_prelim_ref_cache, ref);
> -	}
> +
> +	prelim_release(&preftrees.direct);
> +	prelim_release(&preftrees.indirect);
> +	prelim_release(&preftrees.indirect_missing_keys);
> +
>  	if (ret < 0)
>  		free_inode_elem_list(eie);
>  	return ret;
> -- 
> 2.10.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba July 11, 2017, 3:15 p.m. UTC | #2
On Wed, Jun 28, 2017 at 09:57:00PM -0600, Edmund Nadolski wrote:
> It's been known for a while that the use of multiple lists
> that are periodically merged was an algorithmic problem within
> btrfs.  There are several workloads that don't complete in any
> reasonable amount of time (e.g. btrfs/130) and others that cause
> soft lockups.
> 
> The solution is to use a pair of rbtrees that do insertion merging
> for both indirect and direct refs, with the former converting
> refs into the latter.  The result is a btrfs/130 workload that
> used to take several hours now takes about half of that. This
> runtime still isn't acceptable and a future patch will address that
> by moving the rbtrees higher in the stack so the lookups can be
> shared across multiple calls to find_parent_nodes.
> 
> Signed-off-by: Edmund Nadolski <enadolski@suse.com>
> Signed-off-by: Jeff Mahoney <jeffm@suse.com>

I've bisected to this patch, the self-tests run at module load time
fail:

tests/qgroup-tests.c:272

270         if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
271                                 nodesize, nodesize)) {
272                 test_msg("Qgroup counts didn't match expected values\n");
273                 return -EINVAL;
274         }

 245 int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
 246                                u64 rfer, u64 excl)
 247 {
 248         struct btrfs_qgroup *qgroup;
 249
 250         qgroup = find_qgroup_rb(fs_info, qgroupid);
 251         if (!qgroup)
 252                 return -EINVAL;
 253         if (qgroup->rfer != rfer || qgroup->excl != excl)
 254                 return -EINVAL;
 255         return 0;
 256 }

the second if fails, with 0 != 4096 || 0 != 4096

Tested branch was current for-next-test (top commit
8d73f8348287a3d3be10795f45d313f63cdcd72c), with
CONFIG_BTRFS_FS_RUN_SANITY_TESTS=y
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Edmund Nadolski July 11, 2017, 11:12 p.m. UTC | #3
On 07/11/2017 09:15 AM, David Sterba wrote:
> On Wed, Jun 28, 2017 at 09:57:00PM -0600, Edmund Nadolski wrote:
>> It's been known for a while that the use of multiple lists
>> that are periodically merged was an algorithmic problem within
>> btrfs.  There are several workloads that don't complete in any
>> reasonable amount of time (e.g. btrfs/130) and others that cause
>> soft lockups.
>>
>> The solution is to use a pair of rbtrees that do insertion merging
>> for both indirect and direct refs, with the former converting
>> refs into the latter.  The result is a btrfs/130 workload that
>> used to take several hours now takes about half of that. This
>> runtime still isn't acceptable and a future patch will address that
>> by moving the rbtrees higher in the stack so the lookups can be
>> shared across multiple calls to find_parent_nodes.
>>
>> Signed-off-by: Edmund Nadolski <enadolski@suse.com>
>> Signed-off-by: Jeff Mahoney <jeffm@suse.com>
> 
> I've bisected to this patch, the self-tests run at module load time
> fail:
> 
> tests/qgroup-tests.c:272
> 
> 270         if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
> 271                                 nodesize, nodesize)) {
> 272                 test_msg("Qgroup counts didn't match expected values\n");
> 273                 return -EINVAL;
> 274         }
> 
>  245 int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
>  246                                u64 rfer, u64 excl)
>  247 {
>  248         struct btrfs_qgroup *qgroup;
>  249
>  250         qgroup = find_qgroup_rb(fs_info, qgroupid);
>  251         if (!qgroup)
>  252                 return -EINVAL;
>  253         if (qgroup->rfer != rfer || qgroup->excl != excl)
>  254                 return -EINVAL;
>  255         return 0;
>  256 }
> 
> the second if fails, with 0 != 4096 || 0 != 4096
> 
> Tested branch was current for-next-test (top commit
> 8d73f8348287a3d3be10795f45d313f63cdcd72c), with
> CONFIG_BTRFS_FS_RUN_SANITY_TESTS=y

This looks like a consequence of an existing check in __resolve_indirect_ref():

	if (btrfs_is_testing(fs_info)) {
		srcu_read_unlock(&fs_info->subvol_srcu, index);
		ret = -ENOENT;
		goto out;
	}

The existing code simply leaves the ref on the pref list, to be picked up later
in find_parent_nodes(), which will ulist_add() an entry onto the roots list for
it.  The patch otoh when it sees -ENOENT just frees the ref so no entry is
ever added to the ulist.

The patch can be fixed to behave similarly to the existing code by
inserting the ref into the direct tree instead of freeing it.  This seems
a bit odd since technically the ref isn't actually 'resolved'. Considering
that this code path is really just a special case for the sanity check when
the fs_info is in a BTRFS_FS_STATE_DUMMY_FS_INFO state, perhaps that's not
too great a concern. Thoughts?

Thanks,
Ed


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba July 12, 2017, 3:15 p.m. UTC | #4
On Tue, Jul 11, 2017 at 05:12:27PM -0600, Edmund Nadolski wrote:
> 
> 
> On 07/11/2017 09:15 AM, David Sterba wrote:
> > On Wed, Jun 28, 2017 at 09:57:00PM -0600, Edmund Nadolski wrote:
> >> It's been known for a while that the use of multiple lists
> >> that are periodically merged was an algorithmic problem within
> >> btrfs.  There are several workloads that don't complete in any
> >> reasonable amount of time (e.g. btrfs/130) and others that cause
> >> soft lockups.
> >>
> >> The solution is to use a pair of rbtrees that do insertion merging
> >> for both indirect and direct refs, with the former converting
> >> refs into the latter.  The result is a btrfs/130 workload that
> >> used to take several hours now takes about half of that. This
> >> runtime still isn't acceptable and a future patch will address that
> >> by moving the rbtrees higher in the stack so the lookups can be
> >> shared across multiple calls to find_parent_nodes.
> >>
> >> Signed-off-by: Edmund Nadolski <enadolski@suse.com>
> >> Signed-off-by: Jeff Mahoney <jeffm@suse.com>
> > 
> > I've bisected to this patch, the self-tests run at module load time
> > fail:
> > 
> > tests/qgroup-tests.c:272
> > 
> > 270         if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
> > 271                                 nodesize, nodesize)) {
> > 272                 test_msg("Qgroup counts didn't match expected values\n");
> > 273                 return -EINVAL;
> > 274         }
> > 
> >  245 int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
> >  246                                u64 rfer, u64 excl)
> >  247 {
> >  248         struct btrfs_qgroup *qgroup;
> >  249
> >  250         qgroup = find_qgroup_rb(fs_info, qgroupid);
> >  251         if (!qgroup)
> >  252                 return -EINVAL;
> >  253         if (qgroup->rfer != rfer || qgroup->excl != excl)
> >  254                 return -EINVAL;
> >  255         return 0;
> >  256 }
> > 
> > the second if fails, with 0 != 4096 || 0 != 4096
> > 
> > Tested branch was current for-next-test (top commit
> > 8d73f8348287a3d3be10795f45d313f63cdcd72c), with
> > CONFIG_BTRFS_FS_RUN_SANITY_TESTS=y
> 
> This looks like a consequence of an existing check in __resolve_indirect_ref():
> 
> 	if (btrfs_is_testing(fs_info)) {
> 		srcu_read_unlock(&fs_info->subvol_srcu, index);
> 		ret = -ENOENT;
> 		goto out;
> 	}
> 
> The existing code simply leaves the ref on the pref list, to be picked up later
> in find_parent_nodes(), which will ulist_add() an entry onto the roots list for
> it.  The patch otoh when it sees -ENOENT just frees the ref so no entry is
> ever added to the ulist.
> 
> The patch can be fixed to behave similarly to the existing code by
> inserting the ref into the direct tree instead of freeing it.  This seems
> a bit odd since technically the ref isn't actually 'resolved'. Considering
> that this code path is really just a special case for the sanity check when
> the fs_info is in a BTRFS_FS_STATE_DUMMY_FS_INFO state, perhaps that's not
> too great a concern. Thoughts?

Yeah, this has been introduced by d9ee522ba3ab51b7e3c6d and it wants to
take some shortcuts for the self-tests. I'm concerned because the module
does not load when the self-tests are enabled.

So at this moment I'd take simpler approach and work around it so we can
continue testing and then fix it properly (either populate the trees or
add more exceptions for the self-tests).
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 6cac5ab..ebe8875 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -26,11 +26,6 @@ 
 #include "delayed-ref.h"
 #include "locking.h"
 
-enum merge_mode {
-	MERGE_IDENTICAL_KEYS = 1,
-	MERGE_IDENTICAL_PARENTS,
-};
-
 /* Just an arbitrary number so we can be sure this happened */
 #define BACKREF_FOUND_SHARED 6
 
@@ -129,7 +124,7 @@  static int find_extent_in_eb(const struct extent_buffer *eb,
  * this structure records all encountered refs on the way up to the root
  */
 struct prelim_ref {
-	struct list_head list;
+	struct rb_node rbnode;
 	u64 root_id;
 	struct btrfs_key key_for_search;
 	int level;
@@ -139,6 +134,18 @@  struct prelim_ref {
 	u64 wanted_disk_byte;
 };
 
+struct preftree {
+	struct rb_root root;
+};
+
+#define PREFTREE_INIT	{ .root = RB_ROOT }
+
+struct preftrees {
+	struct preftree direct;    /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
+	struct preftree indirect;  /* BTRFS_[TREE_BLOCK|EXTENT_DATA]_REF_KEY */
+	struct preftree indirect_missing_keys;
+};
+
 static struct kmem_cache *btrfs_prelim_ref_cache;
 
 int __init btrfs_prelim_ref_init(void)
@@ -158,6 +165,108 @@  void btrfs_prelim_ref_exit(void)
 	kmem_cache_destroy(btrfs_prelim_ref_cache);
 }
 
+static void free_pref(struct prelim_ref *ref)
+{
+	kmem_cache_free(btrfs_prelim_ref_cache, ref);
+}
+
+/*
+ * Return 0 when both refs are for the same block (and can be merged).
+ * A -1 return indicates ref1 is a 'lower' block than ref2, while 1
+ * indicates a 'higher' block.
+ */
+static int prelim_ref_compare(struct prelim_ref *ref1,
+			      struct prelim_ref *ref2)
+{
+	if (ref1->level < ref2->level)
+		return -1;
+	if (ref1->level > ref2->level)
+		return 1;
+	if (ref1->root_id < ref2->root_id)
+		return -1;
+	if (ref1->root_id > ref2->root_id)
+		return 1;
+	if (ref1->key_for_search.type < ref2->key_for_search.type)
+		return -1;
+	if (ref1->key_for_search.type > ref2->key_for_search.type)
+		return 1;
+	if (ref1->key_for_search.objectid < ref2->key_for_search.objectid)
+		return -1;
+	if (ref1->key_for_search.objectid > ref2->key_for_search.objectid)
+		return 1;
+	if (ref1->key_for_search.offset < ref2->key_for_search.offset)
+		return -1;
+	if (ref1->key_for_search.offset > ref2->key_for_search.offset)
+		return 1;
+	if (ref1->parent < ref2->parent)
+		return -1;
+	if (ref1->parent > ref2->parent)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Add @newref to the @root rbtree, merging identical refs.
+ *
+ * Callers should assumed that newref has been freed after calling.
+ */
+static void prelim_ref_insert(struct preftree *preftree,
+			      struct prelim_ref *newref)
+{
+	struct rb_root *root;
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct prelim_ref *ref;
+	int result;
+
+	root = &preftree->root;
+	p = &root->rb_node;
+
+	while (*p) {
+		parent = *p;
+		ref = rb_entry(parent, struct prelim_ref, rbnode);
+		result = prelim_ref_compare(ref, newref);
+		if (result < 0) {
+			p = &(*p)->rb_left;
+		} else if (result > 0) {
+			p = &(*p)->rb_right;
+		} else {
+			/* Identical refs, merge them and free @newref */
+			struct extent_inode_elem *eie = ref->inode_list;
+
+			while (eie && eie->next)
+				eie = eie->next;
+
+			if (!eie)
+				ref->inode_list = newref->inode_list;
+			else
+				eie->next = newref->inode_list;
+			ref->count += newref->count;
+			free_pref(newref);
+			return;
+		}
+	}
+
+	rb_link_node(&newref->rbnode, parent, p);
+	rb_insert_color(&newref->rbnode, root);
+}
+
+/*
+ * Release the entire tree.  We don't care about internal consistency so
+ * just free everything and then reset the tree root.
+ */
+static void prelim_release(struct preftree *preftree)
+{
+	struct prelim_ref *ref, *next_ref;
+
+	rbtree_postorder_for_each_entry_safe(ref, next_ref, &preftree->root,
+					     rbnode)
+		free_pref(ref);
+
+	preftree->root = RB_ROOT;
+}
+
 /*
  * the rules for all callers of this function are:
  * - obtaining the parent is the goal
@@ -196,7 +305,7 @@  void btrfs_prelim_ref_exit(void)
  * additional information that's available but not required to find the parent
  * block might help in merging entries to gain some speed.
  */
-static int add_prelim_ref(struct list_head *head, u64 root_id,
+static int add_prelim_ref(struct preftree *preftree, u64 root_id,
 			  const struct btrfs_key *key, int level, u64 parent,
 			  u64 wanted_disk_byte, int count, gfp_t gfp_mask)
 {
@@ -243,11 +352,31 @@  static int add_prelim_ref(struct list_head *head, u64 root_id,
 	ref->count = count;
 	ref->parent = parent;
 	ref->wanted_disk_byte = wanted_disk_byte;
-	list_add_tail(&ref->list, head);
+	prelim_ref_insert(preftree, ref);
 
 	return 0;
 }
 
+/* direct refs use root == 0, key == NULL */
+static int add_direct_ref(struct preftrees *preftrees, int level, u64 parent,
+			  u64 wanted_disk_byte, int count, gfp_t gfp_mask)
+{
+	return add_prelim_ref(&preftrees->direct, 0, NULL, level, parent,
+			      wanted_disk_byte, count, gfp_mask);
+}
+
+/* indirect refs use parent == 0 */
+static int add_indirect_ref(struct preftrees *preftrees, u64 root_id,
+			    const struct btrfs_key *key, int level,
+			    u64 wanted_disk_byte, int count, gfp_t gfp_mask)
+{
+	struct preftree *tree = &preftrees->indirect;
+	if (!key)
+		tree = &preftrees->indirect_missing_keys;
+	return add_prelim_ref(tree, root_id, key, level, 0,
+			      wanted_disk_byte, count, gfp_mask);
+}
+
 static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 			   struct ulist *parents, struct prelim_ref *ref,
 			   int level, u64 time_seq, const u64 *extent_item_pos,
@@ -429,38 +558,58 @@  unode_aux_to_inode_list(struct ulist_node *node)
 }
 
 /*
- * resolve all indirect backrefs from the list
+ * We maintain two separate rbtrees: one for indirect refs and one for
+ * direct refs. Each tree does merge on insertion.  Once all of the
+ * refs have been located, we iterate over the indirect ref tree, resolve
+ * each reference and remove it from the indirect tree, and then insert
+ * the resolved reference into the direct tree, merging there too.
+ *
+ * New backrefs (i.e., for parent nodes) are added to the direct/indirect
+ * rbtrees as they are encountered.  The new, indirect backrefs are
+ * resolved as above.
  */
 static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 				 struct btrfs_path *path, u64 time_seq,
-				 struct list_head *head,
+				 struct preftrees *preftrees,
 				 const u64 *extent_item_pos, u64 total_refs,
 				 u64 root_objectid)
 {
 	int err;
 	int ret = 0;
-	struct prelim_ref *ref;
-	struct prelim_ref *ref_safe;
-	struct prelim_ref *new_ref;
 	struct ulist *parents;
 	struct ulist_node *node;
 	struct ulist_iterator uiter;
+	struct rb_node *rnode;
 
 	parents = ulist_alloc(GFP_NOFS);
 	if (!parents)
 		return -ENOMEM;
 
 	/*
-	 * _safe allows us to insert directly after the current item without
-	 * iterating over the newly inserted items.
-	 * we're also allowed to re-assign ref during iteration.
+	 * We could trade memory usage for performance here by iterating
+	 * the tree, allocating new refs for each insertion, and then
+	 * freeing the entire indirect tree when we're done.  In some test
+	 * cases, the tree can grow quite large (~200k objects).
 	 */
-	list_for_each_entry_safe(ref, ref_safe, head, list) {
-		if (ref->parent)	/* already direct */
-			continue;
-		if (ref->count == 0)
+	while ((rnode = rb_first(&preftrees->indirect.root))) {
+		struct prelim_ref *ref;
+
+		ref = rb_entry(rnode, struct prelim_ref, rbnode);
+		if (WARN(ref->parent,
+			 "BUG: direct ref found in indirect tree")) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		rb_erase(&ref->rbnode, &preftrees->indirect.root);
+
+		if (ref->count == 0) {
+			free_pref(ref);
 			continue;
+		}
+
 		if (root_objectid && ref->root_id != root_objectid) {
+			free_pref(ref);
 			ret = BACKREF_FOUND_SHARED;
 			goto out;
 		}
@@ -472,8 +621,11 @@  static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		 * and return directly.
 		 */
 		if (err == -ENOENT) {
+			/* This only happens when we're doing sanity testing */
+			free_pref(ref);
 			continue;
 		} else if (err) {
+			free_pref(ref);
 			ret = err;
 			goto out;
 		}
@@ -484,19 +636,26 @@  static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		ref->parent = node ? node->val : 0;
 		ref->inode_list = unode_aux_to_inode_list(node);
 
-		/* additional parents require new refs being added here */
+		/* Add a prelim_ref(s) for any other parent(s). */
 		while ((node = ulist_next(parents, &uiter))) {
+			struct prelim_ref *new_ref;
+
 			new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache,
 						   GFP_NOFS);
 			if (!new_ref) {
+				free_pref(ref);
 				ret = -ENOMEM;
 				goto out;
 			}
 			memcpy(new_ref, ref, sizeof(*ref));
 			new_ref->parent = node->val;
 			new_ref->inode_list = unode_aux_to_inode_list(node);
-			list_add(&new_ref->list, &ref->list);
+			prelim_ref_insert(&preftrees->direct, new_ref);
 		}
+
+		/* Now it's a direct ref, put it in the the direct tree */
+		prelim_ref_insert(&preftrees->direct, ref);
+
 		ulist_reinit(parents);
 	}
 out:
@@ -504,44 +663,31 @@  static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
-static inline int ref_for_same_block(struct prelim_ref *ref1,
-				     struct prelim_ref *ref2)
-{
-	if (ref1->level != ref2->level)
-		return 0;
-	if (ref1->root_id != ref2->root_id)
-		return 0;
-	if (ref1->key_for_search.type != ref2->key_for_search.type)
-		return 0;
-	if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
-		return 0;
-	if (ref1->key_for_search.offset != ref2->key_for_search.offset)
-		return 0;
-	if (ref1->parent != ref2->parent)
-		return 0;
-
-	return 1;
-}
-
 /*
  * read tree blocks and add keys where required.
  */
 static int add_missing_keys(struct btrfs_fs_info *fs_info,
-			    struct list_head *head)
+			    struct preftrees *preftrees)
 {
 	struct prelim_ref *ref;
 	struct extent_buffer *eb;
+	struct preftree *tree = &preftrees->indirect_missing_keys;
+	struct rb_node *node;
 
-	list_for_each_entry(ref, head, list) {
-		if (ref->parent)
-			continue;
-		if (ref->key_for_search.type)
-			continue;
+	while ((node = rb_first(&tree->root))) {
+		ref = rb_entry(node, struct prelim_ref, rbnode);
+		rb_erase(node, &tree->root);
+
+		BUG_ON(ref->parent);	/* should not be a direct ref */
+		BUG_ON(ref->key_for_search.type);
 		BUG_ON(!ref->wanted_disk_byte);
+
 		eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0);
 		if (IS_ERR(eb)) {
+			free_pref(ref);
 			return PTR_ERR(eb);
 		} else if (!extent_buffer_uptodate(eb)) {
+			free_pref(ref);
 			free_extent_buffer(eb);
 			return -EIO;
 		}
@@ -552,73 +698,31 @@  static int add_missing_keys(struct btrfs_fs_info *fs_info,
 			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
 		btrfs_tree_read_unlock(eb);
 		free_extent_buffer(eb);
+		prelim_ref_insert(&preftrees->indirect, ref);
 	}
 	return 0;
 }
 
 /*
- * merge backrefs and adjust counts accordingly
- *
- *    FIXME: For MERGE_IDENTICAL_KEYS, if we add more keys in add_prelim_ref
- *           then we can merge more here. Additionally, we could even add a key
- *           range for the blocks we looked into to merge even more (-> replace
- *           unresolved refs by those having a parent).
- */
-static void merge_refs(struct list_head *head, enum merge_mode mode)
-{
-	struct prelim_ref *pos1;
-
-	list_for_each_entry(pos1, head, list) {
-		struct prelim_ref *pos2 = pos1, *tmp;
-
-		list_for_each_entry_safe_continue(pos2, tmp, head, list) {
-			struct prelim_ref *ref1 = pos1, *ref2 = pos2;
-			struct extent_inode_elem *eie;
-
-			if (!ref_for_same_block(ref1, ref2))
-				continue;
-			if (mode == MERGE_IDENTICAL_KEYS) {
-				if (!ref1->parent && ref2->parent)
-					swap(ref1, ref2);
-			} else {
-				if (ref1->parent != ref2->parent)
-					continue;
-			}
-
-			eie = ref1->inode_list;
-			while (eie && eie->next)
-				eie = eie->next;
-			if (eie)
-				eie->next = ref2->inode_list;
-			else
-				ref1->inode_list = ref2->inode_list;
-			ref1->count += ref2->count;
-
-			list_del(&ref2->list);
-			kmem_cache_free(btrfs_prelim_ref_cache, ref2);
-			cond_resched();
-		}
-
-	}
-}
-
-/*
  * add all currently queued delayed refs from this head whose seq nr is
  * smaller or equal that seq to the list
  */
 static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
-			    struct list_head *prefs, u64 *total_refs,
+			    struct preftrees *preftrees, u64 *total_refs,
 			    u64 inum)
 {
 	struct btrfs_delayed_ref_node *node;
 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
 	struct btrfs_key key;
-	struct btrfs_key op_key = {0};
+	struct btrfs_key tmp_op_key;
+	struct btrfs_key *op_key = NULL;
 	int sgn;
 	int ret = 0;
 
-	if (extent_op && extent_op->update_key)
-		btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
+	if (extent_op && extent_op->update_key) {
+		btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
+		op_key = &tmp_op_key;
+	}
 
 	spin_lock(&head->lock);
 	list_for_each_entry(node, &head->ref_list, list) {
@@ -642,24 +746,30 @@  static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 		*total_refs += (node->ref_mod * sgn);
 		switch (node->type) {
 		case BTRFS_TREE_BLOCK_REF_KEY: {
+			/* NORMAL INDIRECT METADATA backref */
 			struct btrfs_delayed_tree_ref *ref;
 
 			ref = btrfs_delayed_node_to_tree_ref(node);
-			ret = add_prelim_ref(prefs, ref->root, &op_key,
-					     ref->level + 1, 0, node->bytenr,
-					     node->ref_mod * sgn, GFP_ATOMIC);
+			ret = add_indirect_ref(preftrees, ref->root, &tmp_op_key,
+					       ref->level + 1, node->bytenr,
+					       node->ref_mod * sgn,
+					       GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_SHARED_BLOCK_REF_KEY: {
+			/* SHARED DIRECT METADATA backref */
 			struct btrfs_delayed_tree_ref *ref;
 
 			ref = btrfs_delayed_node_to_tree_ref(node);
-			ret = add_prelim_ref(prefs, 0, NULL, ref->level + 1,
+
+			ret = add_direct_ref(preftrees, ref->level + 1,
 					     ref->parent, node->bytenr,
-					     node->ref_mod * sgn, GFP_ATOMIC);
+					     node->ref_mod * sgn,
+					     GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_EXTENT_DATA_REF_KEY: {
+			/* NORMAL INDIRECT DATA backref */
 			struct btrfs_delayed_data_ref *ref;
 			ref = btrfs_delayed_node_to_data_ref(node);
 
@@ -676,17 +786,21 @@  static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 				break;
 			}
 
-			ret = add_prelim_ref(prefs, ref->root, &key, 0, 0,
-					     node->bytenr, node->ref_mod * sgn,
-					     GFP_ATOMIC);
+			ret = add_indirect_ref(preftrees, ref->root, &key, 0,
+					       node->bytenr,
+					       node->ref_mod * sgn,
+					       GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_SHARED_DATA_REF_KEY: {
+			/* SHARED DIRECT FULL backref */
 			struct btrfs_delayed_data_ref *ref;
 
 			ref = btrfs_delayed_node_to_data_ref(node);
-			ret = add_prelim_ref(prefs, 0, NULL, 0, ref->parent,
-					     node->bytenr, node->ref_mod * sgn,
+
+			ret = add_direct_ref(preftrees, 0, ref->parent,
+					     node->bytenr,
+					     node->ref_mod * sgn,
 					     GFP_ATOMIC);
 			break;
 		}
@@ -704,7 +818,7 @@  static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
  * add all inline backrefs for bytenr to the list
  */
 static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
-			   int *info_level, struct list_head *prefs,
+			   int *info_level, struct preftrees *preftrees,
 			   u64 *total_refs, u64 inum)
 {
 	int ret = 0;
@@ -760,8 +874,8 @@  static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
 
 		switch (type) {
 		case BTRFS_SHARED_BLOCK_REF_KEY:
-			ret = add_prelim_ref(prefs, 0, NULL, *info_level + 1,
-					     offset, bytenr, 1, GFP_NOFS);
+			ret = add_direct_ref(preftrees, *info_level + 1, offset,
+					     bytenr, 1, GFP_NOFS);
 			break;
 		case BTRFS_SHARED_DATA_REF_KEY: {
 			struct btrfs_shared_data_ref *sdref;
@@ -769,14 +883,15 @@  static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
 
 			sdref = (struct btrfs_shared_data_ref *)(iref + 1);
 			count = btrfs_shared_data_ref_count(leaf, sdref);
-			ret = add_prelim_ref(prefs, 0, NULL, 0, offset,
+
+			ret = add_direct_ref(preftrees, 0, offset,
 					     bytenr, count, GFP_NOFS);
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
-			ret = add_prelim_ref(prefs, offset, NULL,
-					     *info_level + 1, 0,
-					     bytenr, 1, GFP_NOFS);
+			ret = add_indirect_ref(preftrees, offset, NULL,
+					       *info_level + 1, bytenr, 1,
+					       GFP_NOFS);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			struct btrfs_extent_data_ref *dref;
@@ -796,8 +911,9 @@  static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
 			}
 
 			root = btrfs_extent_data_ref_root(leaf, dref);
-			ret = add_prelim_ref(prefs, root, &key, 0, 0,
-					     bytenr, count, GFP_NOFS);
+
+			ret = add_indirect_ref(preftrees, root, &key, 0, bytenr,
+					       count, GFP_NOFS);
 			break;
 		}
 		default:
@@ -816,7 +932,8 @@  static int add_inline_refs(struct btrfs_path *path, u64 bytenr,
  */
 static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 			  struct btrfs_path *path, u64 bytenr,
-			  int info_level, struct list_head *prefs, u64 inum)
+			  int info_level, struct preftrees *preftrees,
+			  u64 inum)
 {
 	struct btrfs_root *extent_root = fs_info->extent_root;
 	int ret;
@@ -846,26 +963,31 @@  static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 
 		switch (key.type) {
 		case BTRFS_SHARED_BLOCK_REF_KEY:
-			ret = add_prelim_ref(prefs, 0, NULL, info_level + 1,
-					     key.offset, bytenr, 1, GFP_NOFS);
+			/* SHARED DIRECT METADATA backref */
+			ret = add_direct_ref(preftrees, info_level + 1,
+					     key.offset, bytenr, 1,
+					     GFP_NOFS);
 			break;
 		case BTRFS_SHARED_DATA_REF_KEY: {
+			/* SHARED DIRECT FULL backref */
 			struct btrfs_shared_data_ref *sdref;
 			int count;
 
 			sdref = btrfs_item_ptr(leaf, slot,
 					      struct btrfs_shared_data_ref);
 			count = btrfs_shared_data_ref_count(leaf, sdref);
-			ret = add_prelim_ref(prefs, 0, NULL, 0, key.offset,
-					     bytenr, count, GFP_NOFS);
+			ret = add_direct_ref(preftrees, 0, key.offset, bytenr,
+					     count, GFP_NOFS);
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
-			ret = add_prelim_ref(prefs, key.offset, NULL,
-					     info_level + 1, 0,
-					     bytenr, 1, GFP_NOFS);
+			/* NORMAL INDIRECT METADATA backref */
+			ret = add_indirect_ref(preftrees, key.offset, NULL,
+					       info_level + 1, bytenr, 1,
+					       GFP_NOFS);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY: {
+			/* NORMAL INDIRECT DATA backref */
 			struct btrfs_extent_data_ref *dref;
 			int count;
 			u64 root;
@@ -884,8 +1006,8 @@  static int add_keyed_refs(struct btrfs_fs_info *fs_info,
 			}
 
 			root = btrfs_extent_data_ref_root(leaf, dref);
-			ret = add_prelim_ref(prefs, root, &key, 0, 0,
-					     bytenr, count, GFP_NOFS);
+			ret = add_indirect_ref(preftrees, root, &key, 0, bytenr,
+					       count, GFP_NOFS);
 			break;
 		}
 		default:
@@ -926,14 +1048,16 @@  static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_head *head;
 	int info_level = 0;
 	int ret;
-	struct list_head prefs_delayed;
-	struct list_head prefs;
 	struct prelim_ref *ref;
+	struct rb_node *node;
 	struct extent_inode_elem *eie = NULL;
+	/* total of both direct AND indirect refs! */
 	u64 total_refs = 0;
-
-	INIT_LIST_HEAD(&prefs);
-	INIT_LIST_HEAD(&prefs_delayed);
+	struct preftrees preftrees = {
+		.direct = PREFTREE_INIT,
+		.indirect = PREFTREE_INIT,
+		.indirect_missing_keys = PREFTREE_INIT
+	};
 
 	key.objectid = bytenr;
 	key.offset = (u64)-1;
@@ -996,9 +1120,8 @@  static int find_parent_nodes(struct btrfs_trans_handle *trans,
 				goto again;
 			}
 			spin_unlock(&delayed_refs->lock);
-			ret = add_delayed_refs(head, time_seq,
-					       &prefs_delayed, &total_refs,
-					       inum);
+			ret = add_delayed_refs(head, time_seq, &preftrees,
+					       &total_refs, inum);
 			mutex_unlock(&head->mutex);
 			if (ret)
 				goto out;
@@ -1019,35 +1142,43 @@  static int find_parent_nodes(struct btrfs_trans_handle *trans,
 		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
 		     key.type == BTRFS_METADATA_ITEM_KEY)) {
 			ret = add_inline_refs(path, bytenr, &info_level,
-					      &prefs, &total_refs, inum);
+					      &preftrees, &total_refs, inum);
 			if (ret)
 				goto out;
 			ret = add_keyed_refs(fs_info, path, bytenr, info_level,
-					     &prefs, inum);
+					     &preftrees, inum);
 			if (ret)
 				goto out;
 		}
 	}
-	btrfs_release_path(path);
 
-	list_splice_init(&prefs_delayed, &prefs);
+	btrfs_release_path(path);
 
-	ret = add_missing_keys(fs_info, &prefs);
+	ret = add_missing_keys(fs_info, &preftrees);
 	if (ret)
 		goto out;
 
-	merge_refs(&prefs, MERGE_IDENTICAL_KEYS);
+	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root));
 
-	ret = resolve_indirect_refs(fs_info, path, time_seq, &prefs,
+	ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
 				    extent_item_pos, total_refs,
 				    root_objectid);
 	if (ret)
 		goto out;
 
-	merge_refs(&prefs, MERGE_IDENTICAL_PARENTS);
+	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root));
 
-	while (!list_empty(&prefs)) {
-		ref = list_first_entry(&prefs, struct prelim_ref, list);
+	/*
+	 * This walks the tree of merged and resolved refs. Tree blocks are
+	 * read in as needed. Unique entries are added to the ulist, and
+	 * the list of found roots is updated.
+	 *
+	 * We release the entire tree in one go before returning.
+	 */
+	node = rb_first(&preftrees.direct.root);
+	while (node) {
+		ref = rb_entry(node, struct prelim_ref, rbnode);
+		node = rb_next(&ref->rbnode);
 		WARN_ON(ref->count < 0);
 		if (roots && ref->count && ref->root_id && ref->parent == 0) {
 			if (root_objectid && ref->root_id != root_objectid) {
@@ -1101,23 +1232,15 @@  static int find_parent_nodes(struct btrfs_trans_handle *trans,
 			}
 			eie = NULL;
 		}
-		list_del(&ref->list);
-		kmem_cache_free(btrfs_prelim_ref_cache, ref);
 	}
 
 out:
 	btrfs_free_path(path);
-	while (!list_empty(&prefs)) {
-		ref = list_first_entry(&prefs, struct prelim_ref, list);
-		list_del(&ref->list);
-		kmem_cache_free(btrfs_prelim_ref_cache, ref);
-	}
-	while (!list_empty(&prefs_delayed)) {
-		ref = list_first_entry(&prefs_delayed, struct prelim_ref,
-				       list);
-		list_del(&ref->list);
-		kmem_cache_free(btrfs_prelim_ref_cache, ref);
-	}
+
+	prelim_release(&preftrees.direct);
+	prelim_release(&preftrees.indirect);
+	prelim_release(&preftrees.indirect_missing_keys);
+
 	if (ret < 0)
 		free_inode_elem_list(eie);
 	return ret;