diff mbox series

[v3,07/12] btrfs: replace pending/pinned chunks lists with io tree

Message ID 20190325123132.27835-8-nborisov@suse.com (mailing list archive)
State New, archived
Headers show
Series FITRIM improvements | expand

Commit Message

Nikolay Borisov March 25, 2019, 12:31 p.m. UTC
From: Jeff Mahoney <jeffm@suse.com>

The pending chunks list contains chunks that are allocated in the
current transaction but haven't been created yet. The pinned chunks
list contains chunks that are being released in the current transaction.
Both describe chunks that are not reflected on disk as in use but are
unavailable just the same.

The pending chunks list is anchored by the transaction handle, which
means that we need to hold a reference to a transaction when working
with the list.

We use these lists to ensure that we don't end up discarding chunks
that are allocated or released in the current transaction.

The way we use them is by iterating over both lists to perform
comparisons on the stripes they describe for each device. This is
backwards and requires that we keep a transaction handle open while
we're trimming.

This patch adds an extent_io_tree to btrfs_device that maintains
the allocation state of the device.  Extents are set dirty when
chunks are first allocated -- when the extent maps are added to the
mapping tree. They're cleared when last removed -- when the extent
maps are removed from the mapping tree. This matches the lifespan
of the pending and pinned chunks list and allows us to do trims
on unallocated space safely without pinning the transaction for what
may be a lengthy operation. We can also use this io tree to mark
which chunks have already been trimmed so we don't repeat the operation.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
---
 fs/btrfs/ctree.h            |  6 ---
 fs/btrfs/disk-io.c          | 11 -----
 fs/btrfs/extent-tree.c      | 28 -----------
 fs/btrfs/extent_io.c        |  2 +-
 fs/btrfs/extent_io.h        |  6 ++-
 fs/btrfs/extent_map.c       | 36 ++++++++++++++
 fs/btrfs/extent_map.h       |  1 -
 fs/btrfs/free-space-cache.c |  4 --
 fs/btrfs/transaction.c      |  9 ----
 fs/btrfs/transaction.h      |  1 -
 fs/btrfs/volumes.c          | 96 +++++++++++++------------------------
 fs/btrfs/volumes.h          |  2 +
 12 files changed, 76 insertions(+), 126 deletions(-)

Comments

David Sterba March 25, 2019, 2:22 p.m. UTC | #1
On Mon, Mar 25, 2019 at 02:31:27PM +0200, Nikolay Borisov wrote:
> From: Jeff Mahoney <jeffm@suse.com>
> 
> The pending chunks list contains chunks that are allocated in the
> current transaction but haven't been created yet. The pinned chunks
> list contains chunks that are being released in the current transaction.
> Both describe chunks that are not reflected on disk as in use but are
> unavailable just the same.
> 
> The pending chunks list is anchored by the transaction handle, which
> means that we need to hold a reference to a transaction when working
> with the list.
> 
> We use these lists to ensure that we don't end up discarding chunks
> that are allocated or released in the current transaction.  What we r

unfinished sentence?

> 
> The way we use them is by iterating over both lists to perform
> comparisons on the stripes they describe for each device. This is
> backwards and requires that we keep a transaction handle open while
> we're trimming.
> 
> This patchset adds an extent_io_tree to btrfs_device that maintains
> the allocation state of the device.  Extents are set dirty when
> chunks are first allocated -- when the extent maps are added to the
> mapping tree. They're cleared when last removed -- when the extent
> maps are removed from the mapping tree. This matches the lifespan
> of the pending and pinned chunks list and allows us to do trims
> on unallocated space safely without pinning the transaction for what
> may be a lengthy operation. We can also use this io tree to mark
> which chunks have already been trimmed so we don't repeat the operation.
> 
> Signed-off-by: Nikolay Borisov <nborisov@suse.com>
> ---
>  fs/btrfs/ctree.h            |  6 ---
>  fs/btrfs/disk-io.c          | 11 -----
>  fs/btrfs/extent-tree.c      | 28 -----------
>  fs/btrfs/extent_io.c        |  2 +-
>  fs/btrfs/extent_io.h        |  6 ++-
>  fs/btrfs/extent_map.c       | 36 ++++++++++++++
>  fs/btrfs/extent_map.h       |  1 -
>  fs/btrfs/free-space-cache.c |  4 --
>  fs/btrfs/transaction.c      |  9 ----
>  fs/btrfs/transaction.h      |  1 -
>  fs/btrfs/volumes.c          | 96 +++++++++++++------------------------
>  fs/btrfs/volumes.h          |  2 +
>  12 files changed, 76 insertions(+), 126 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 880b5abd8ecc..0b25c2f1b77d 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1152,12 +1152,6 @@ struct btrfs_fs_info {
>  	struct mutex unused_bg_unpin_mutex;
>  	struct mutex delete_unused_bgs_mutex;
>  
> -	/*
> -	 * Chunks that can't be freed yet (under a trim/discard operation)
> -	 * and will be latter freed. Protected by fs_info->chunk_mutex.
> -	 */
> -	struct list_head pinned_chunks;
> -
>  	/* Cached block sizes */
>  	u32 nodesize;
>  	u32 sectorsize;
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 81183694f8ea..813fc21b2b0b 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2789,8 +2789,6 @@ int open_ctree(struct super_block *sb,
>  	init_waitqueue_head(&fs_info->async_submit_wait);
>  	init_waitqueue_head(&fs_info->delayed_iputs_wait);
>  
> -	INIT_LIST_HEAD(&fs_info->pinned_chunks);
> -
>  	/* Usable values until the real ones are cached from the superblock */
>  	fs_info->nodesize = 4096;
>  	fs_info->sectorsize = 4096;
> @@ -4065,15 +4063,6 @@ void close_ctree(struct btrfs_fs_info *fs_info)
>  
>  	btrfs_free_stripe_hash_table(fs_info);
>  	btrfs_free_ref_cache(fs_info);
> -
> -	while (!list_empty(&fs_info->pinned_chunks)) {
> -		struct extent_map *em;
> -
> -		em = list_first_entry(&fs_info->pinned_chunks,
> -				      struct extent_map, list);
> -		list_del_init(&em->list);
> -		free_extent_map(em);
> -	}
>  }
>  
>  int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index dcda3c4de240..eb4b7f2b10a1 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -10946,10 +10946,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>  	memcpy(&key, &block_group->key, sizeof(key));
>  
>  	mutex_lock(&fs_info->chunk_mutex);
> -	if (!list_empty(&em->list)) {
> -		/* We're in the transaction->pending_chunks list. */
> -		free_extent_map(em);
> -	}
>  	spin_lock(&block_group->lock);
>  	block_group->removed = 1;
>  	/*
> @@ -10976,25 +10972,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>  	 * the transaction commit has completed.
>  	 */
>  	remove_em = (atomic_read(&block_group->trimming) == 0);
> -	/*
> -	 * Make sure a trimmer task always sees the em in the pinned_chunks list
> -	 * if it sees block_group->removed == 1 (needs to lock block_group->lock
> -	 * before checking block_group->removed).
> -	 */
> -	if (!remove_em) {
> -		/*
> -		 * Our em might be in trans->transaction->pending_chunks which
> -		 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
> -		 * and so is the fs_info->pinned_chunks list.
> -		 *
> -		 * So at this point we must be holding the chunk_mutex to avoid
> -		 * any races with chunk allocation (more specifically at
> -		 * volumes.c:contains_pending_extent()), to ensure it always
> -		 * sees the em, either in the pending_chunks list or in the
> -		 * pinned_chunks list.
> -		 */
> -		list_move_tail(&em->list, &fs_info->pinned_chunks);
> -	}
>  	spin_unlock(&block_group->lock);
>  
>  	if (remove_em) {
> @@ -11002,11 +10979,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>  
>  		em_tree = &fs_info->mapping_tree.map_tree;
>  		write_lock(&em_tree->lock);
> -		/*
> -		 * The em might be in the pending_chunks list, so make sure the
> -		 * chunk mutex is locked, since remove_extent_mapping() will
> -		 * delete us from that list.
> -		 */
>  		remove_extent_mapping(em_tree, em);
>  		write_unlock(&em_tree->lock);
>  		/* once for the tree */
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 0e953c469224..ff3f8443e180 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -918,7 +918,7 @@ static void cache_state(struct extent_state *state,
>   * [start, end] is inclusive This takes the tree lock.
>   */
>  
> -static int __must_check
> +int __must_check
>  __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
>  		 unsigned bits, unsigned exclusive_bits,
>  		 u64 *failed_start, struct extent_state **cached_state,
> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
> index 0fda249e5982..4bcc203b5431 100644
> --- a/fs/btrfs/extent_io.h
> +++ b/fs/btrfs/extent_io.h
> @@ -329,7 +329,11 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
>  int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
>  		   unsigned bits, u64 *failed_start,
>  		   struct extent_state **cached_state, gfp_t mask);
> -
> +int
> +__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
> +		 unsigned bits, unsigned exclusive_bits,
> +		 u64 *failed_start, struct extent_state **cached_state,
> +		 gfp_t mask, struct extent_changeset *changeset);
>  static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
>  		u64 end, unsigned bits)
>  {
> diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
> index 928f729c55ba..0820f6fcf3a6 100644
> --- a/fs/btrfs/extent_map.c
> +++ b/fs/btrfs/extent_map.c
> @@ -4,6 +4,7 @@
>  #include <linux/slab.h>
>  #include <linux/spinlock.h>
>  #include "ctree.h"
> +#include "volumes.h"
>  #include "extent_map.h"
>  #include "compression.h"
>  
> @@ -336,6 +337,37 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree,
>  	else
>  		try_merge_map(tree, em);
>  }
> +static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
> +{
> +	struct map_lookup *map = em->map_lookup;
> +	u64 stripe_size = em->orig_block_len;
> +	int i;
> +
> +	for (i = 0; i < map->num_stripes; i++) {
> +		struct btrfs_bio_stripe *stripe = &map->stripes[i];
> +		struct btrfs_device *device = stripe->dev;
> +
> +		__set_extent_bit(&device->alloc_state, stripe->physical,
> +				 stripe->physical + stripe_size - 1, bits,
> +				 0, NULL, NULL, GFP_NOWAIT, NULL);
> +	}
> +}
> +
> +static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
> +{
> +	struct map_lookup *map = em->map_lookup;
> +	u64 stripe_size = em->orig_block_len;
> +	int i;
> +
> +	for (i = 0; i < map->num_stripes; i++) {
> +		struct btrfs_bio_stripe *stripe = &map->stripes[i];
> +		struct btrfs_device *device = stripe->dev;
> +
> +		__clear_extent_bit(&device->alloc_state, stripe->physical,
> +				   stripe->physical + stripe_size - 1, bits,
> +				   0, 0, NULL, GFP_NOWAIT, NULL);
> +	}
> +}
>  
>  /**
>   * add_extent_mapping - add new extent map to the extent tree
> @@ -357,6 +389,8 @@ int add_extent_mapping(struct extent_map_tree *tree,
>  		goto out;
>  
>  	setup_extent_mapping(tree, em, modified);
> +	if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
> +		extent_map_device_set_bits(em, CHUNK_ALLOCATED);
>  out:
>  	return ret;
>  }
> @@ -438,6 +472,8 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
>  	rb_erase_cached(&em->rb_node, &tree->map);
>  	if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
>  		list_del_init(&em->list);
> +	if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
> +		extent_map_device_clear_bits(em, CHUNK_ALLOCATED);
>  	RB_CLEAR_NODE(&em->rb_node);
>  }
>  
> diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
> index 473f039fcd7c..72b46833f236 100644
> --- a/fs/btrfs/extent_map.h
> +++ b/fs/btrfs/extent_map.h
> @@ -91,7 +91,6 @@ void replace_extent_mapping(struct extent_map_tree *tree,
>  			    struct extent_map *cur,
>  			    struct extent_map *new,
>  			    int modified);
> -
>  struct extent_map *alloc_extent_map(void);
>  void free_extent_map(struct extent_map *em);
>  int __init extent_map_init(void);
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index 74aa552f4793..207fb50dcc7a 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -3366,10 +3366,6 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
>  		em = lookup_extent_mapping(em_tree, block_group->key.objectid,
>  					   1);
>  		BUG_ON(!em); /* logic error, can't happen */
> -		/*
> -		 * remove_extent_mapping() will delete us from the pinned_chunks
> -		 * list, which is protected by the chunk mutex.
> -		 */
>  		remove_extent_mapping(em_tree, em);
>  		write_unlock(&em_tree->lock);
>  		mutex_unlock(&fs_info->chunk_mutex);
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index b32769998bbb..e5404326fc55 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -50,14 +50,6 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
>  			btrfs_err(transaction->fs_info,
>  				  "pending csums is %llu",
>  				  transaction->delayed_refs.pending_csums);
> -		while (!list_empty(&transaction->pending_chunks)) {
> -			struct extent_map *em;
> -
> -			em = list_first_entry(&transaction->pending_chunks,
> -					      struct extent_map, list);
> -			list_del_init(&em->list);
> -			free_extent_map(em);
> -		}
>  		/*
>  		 * If any block groups are found in ->deleted_bgs then it's
>  		 * because the transaction was aborted and a commit did not
> @@ -235,7 +227,6 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info,
>  	spin_lock_init(&cur_trans->delayed_refs.lock);
>  
>  	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
> -	INIT_LIST_HEAD(&cur_trans->pending_chunks);
>  	INIT_LIST_HEAD(&cur_trans->dev_update_list);
>  	INIT_LIST_HEAD(&cur_trans->switch_commits);
>  	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
> diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
> index 2bd76f681520..4419a4a0294b 100644
> --- a/fs/btrfs/transaction.h
> +++ b/fs/btrfs/transaction.h
> @@ -51,7 +51,6 @@ struct btrfs_transaction {
>  	wait_queue_head_t writer_wait;
>  	wait_queue_head_t commit_wait;
>  	struct list_head pending_snapshots;
> -	struct list_head pending_chunks;
>  	struct list_head dev_update_list;
>  	struct list_head switch_commits;
>  	struct list_head dirty_bgs;
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 39a107bdde06..a06f1e3c63b9 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -335,6 +335,8 @@ void btrfs_free_device(struct btrfs_device *device)
>  {
>  	WARN_ON(!list_empty(&device->post_commit_list));
>  	rcu_string_free(device->name);
> +	if (!in_softirq())
> +		extent_io_tree_release(&device->alloc_state);
>  	bio_put(device->flush_bio);
>  	kfree(device);
>  }
> @@ -411,6 +413,7 @@ static struct btrfs_device *__alloc_device(void)
>  	btrfs_device_data_ordered_init(dev);
>  	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
>  	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
> +	extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
>  
>  	return dev;
>  }
> @@ -1269,6 +1272,9 @@ static void btrfs_close_one_device(struct btrfs_device *device)
>  	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
>  		fs_devices->missing_devices--;
>  
> +	/* Remove alloc state now since it cannot be done in RCU context */
> +	extent_io_tree_release(&device->alloc_state);
> +
>  	btrfs_close_bdev(device);
>  
>  	new_device = btrfs_alloc_device(NULL, &device->devid,
> @@ -1505,58 +1511,29 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
>  	return device;
>  }
>  
> -static int contains_pending_extent(struct btrfs_transaction *transaction,
> -				   struct btrfs_device *device,
> -				   u64 *start, u64 len)
> -{
> -	struct btrfs_fs_info *fs_info = device->fs_info;
> -	struct extent_map *em;
> -	struct list_head *search_list = &fs_info->pinned_chunks;
> -	int ret = 0;
> -	u64 physical_start = *start;
> -
> -	if (transaction)
> -		search_list = &transaction->pending_chunks;
> -again:
> -	list_for_each_entry(em, search_list, list) {
> -		struct map_lookup *map;
> -		int i;
> -
> -		map = em->map_lookup;
> -		for (i = 0; i < map->num_stripes; i++) {
> -			u64 end;
> -
> -			if (map->stripes[i].dev != device)
> -				continue;
> -			if (map->stripes[i].physical >= physical_start + len ||
> -			    map->stripes[i].physical + em->orig_block_len <=
> -			    physical_start)
> -				continue;
> -			/*
> -			 * Make sure that while processing the pinned list we do
> -			 * not override our *start with a lower value, because
> -			 * we can have pinned chunks that fall within this
> -			 * device hole and that have lower physical addresses
> -			 * than the pending chunks we processed before. If we
> -			 * do not take this special care we can end up getting
> -			 * 2 pending chunks that start at the same physical
> -			 * device offsets because the end offset of a pinned
> -			 * chunk can be equal to the start offset of some
> -			 * pending chunk.
> -			 */
> -			end = map->stripes[i].physical + em->orig_block_len;
> -			if (end > *start) {
> -				*start = end;
> -				ret = 1;
> -			}
> +/*
> + * Tries to find a chunk that intersects [start, start +len] range and when one
> + * such is found, records the end of it in *start
> + */
> +#define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
> +static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
> +				    u64 len)
> +{
> +	u64 physical_start, physical_end;
> +	lockdep_assert_held(&device->fs_info->chunk_mutex);
> +
> +	if (!find_first_extent_bit(&device->alloc_state, *start,
> +				   &physical_start, &physical_end,
> +				   CHUNK_ALLOCATED, NULL)) {
> +
> +		if (in_range(physical_start, *start, len) ||
> +		    in_range(*start, physical_start,
> +			     physical_end - physical_start)) {
> +			*start = physical_end + 1;
> +			return true;
>  		}
>  	}
> -	if (search_list != &fs_info->pinned_chunks) {
> -		search_list = &fs_info->pinned_chunks;
> -		goto again;
> -	}
> -
> -	return ret;
> +	return false;
>  }
>  
>  
> @@ -1667,15 +1644,12 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction,
>  			 * Have to check before we set max_hole_start, otherwise
>  			 * we could end up sending back this offset anyway.
>  			 */
> -			if (contains_pending_extent(transaction, device,
> -						    &search_start,
> +			if (contains_pending_extent(device, &search_start,
>  						    hole_size)) {
> -				if (key.offset >= search_start) {
> +				if (key.offset >= search_start)
>  					hole_size = key.offset - search_start;
> -				} else {
> -					WARN_ON_ONCE(1);
> +				else
>  					hole_size = 0;
> -				}
>  			}
>  
>  			if (hole_size > max_hole_size) {
> @@ -1716,8 +1690,7 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction,
>  	if (search_end > search_start) {
>  		hole_size = search_end - search_start;
>  
> -		if (contains_pending_extent(transaction, device, &search_start,
> -					    hole_size)) {
> +		if (contains_pending_extent(device, &search_start, hole_size)) {
>  			btrfs_release_path(path);
>  			goto again;
>  		}
> @@ -4759,7 +4732,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
>  	 * in-memory chunks are synced to disk so that the loop below sees them
>  	 * and relocates them accordingly.
>  	 */
> -	if (contains_pending_extent(trans->transaction, device, &start, diff)) {
> +	if (contains_pending_extent(device, &start, diff)) {
>  		mutex_unlock(&fs_info->chunk_mutex);
>  		ret = btrfs_commit_transaction(trans);
>  		if (ret)
> @@ -5201,9 +5174,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
>  		free_extent_map(em);
>  		goto error;
>  	}
> -
> -	list_add_tail(&em->list, &trans->transaction->pending_chunks);
> -	refcount_inc(&em->refs);
>  	write_unlock(&em_tree->lock);
>  
>  	ret = btrfs_make_block_group(trans, 0, type, start, chunk_size);
> @@ -5236,8 +5206,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
>  	free_extent_map(em);
>  	/* One for the tree reference */
>  	free_extent_map(em);
> -	/* One for the pending_chunks list reference */
> -	free_extent_map(em);
>  error:
>  	kfree(devices_info);
>  	return ret;
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index a0f09aad3770..49dc737d8a54 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -134,6 +134,8 @@ struct btrfs_device {
>  	/* Counter to record the change of device stats */
>  	atomic_t dev_stats_ccnt;
>  	atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
> +
> +	struct extent_io_tree alloc_state;
>  };
>  
>  /*
> -- 
> 2.17.1
David Sterba March 25, 2019, 4:26 p.m. UTC | #2
On Mon, Mar 25, 2019 at 02:31:27PM +0200, Nikolay Borisov wrote:
> From: Jeff Mahoney <jeffm@suse.com>
> 
> The pending chunks list contains chunks that are allocated in the
> current transaction but haven't been created yet. The pinned chunks
> list contains chunks that are being released in the current transaction.
> Both describe chunks that are not reflected on disk as in use but are
> unavailable just the same.
> 
> The pending chunks list is anchored by the transaction handle, which
> means that we need to hold a reference to a transaction when working
> with the list.
> 
> We use these lists to ensure that we don't end up discarding chunks
> that are allocated or released in the current transaction.  What we r
> 
> The way we use them is by iterating over both lists to perform
> comparisons on the stripes they describe for each device. This is
> backwards and requires that we keep a transaction handle open while
> we're trimming.
> 
> This patchset adds an extent_io_tree to btrfs_device that maintains
> the allocation state of the device.  Extents are set dirty when
> chunks are first allocated -- when the extent maps are added to the
> mapping tree. They're cleared when last removed -- when the extent
> maps are removed from the mapping tree. This matches the lifespan
> of the pending and pinned chunks list and allows us to do trims
> on unallocated space safely without pinning the transaction for what
> may be a lengthy operation. We can also use this io tree to mark
> which chunks have already been trimmed so we don't repeat the operation.
> 
> Signed-off-by: Nikolay Borisov <nborisov@suse.com>
> ---
>  fs/btrfs/ctree.h            |  6 ---
>  fs/btrfs/disk-io.c          | 11 -----
>  fs/btrfs/extent-tree.c      | 28 -----------
>  fs/btrfs/extent_io.c        |  2 +-
>  fs/btrfs/extent_io.h        |  6 ++-
>  fs/btrfs/extent_map.c       | 36 ++++++++++++++
>  fs/btrfs/extent_map.h       |  1 -
>  fs/btrfs/free-space-cache.c |  4 --
>  fs/btrfs/transaction.c      |  9 ----
>  fs/btrfs/transaction.h      |  1 -
>  fs/btrfs/volumes.c          | 96 +++++++++++++------------------------
>  fs/btrfs/volumes.h          |  2 +
>  12 files changed, 76 insertions(+), 126 deletions(-)
> 
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -918,7 +918,7 @@ static void cache_state(struct extent_state *state,
>   * [start, end] is inclusive This takes the tree lock.
>   */
>  
> -static int __must_check
> +int __must_check
>  __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
>  		 unsigned bits, unsigned exclusive_bits,
>  		 u64 *failed_start, struct extent_state **cached_state,

Does this really need to be exported again? There are helpers that can
wrap specific combinations of parameters.

> @@ -335,6 +335,8 @@ void btrfs_free_device(struct btrfs_device *device)
>  {
>  	WARN_ON(!list_empty(&device->post_commit_list));
>  	rcu_string_free(device->name);
> +	if (!in_softirq())
> +		extent_io_tree_release(&device->alloc_state);

This needs a comment

>  	bio_put(device->flush_bio);
>  	kfree(device);
>  }

The commit is quite big but I don't see how to shrink it, the changes
need to be done in several places. So, probably ok.
Nikolay Borisov March 25, 2019, 4:43 p.m. UTC | #3
On 25.03.19 г. 18:26 ч., David Sterba wrote:
> On Mon, Mar 25, 2019 at 02:31:27PM +0200, Nikolay Borisov wrote:
>> From: Jeff Mahoney <jeffm@suse.com>
>>
>> The pending chunks list contains chunks that are allocated in the
>> current transaction but haven't been created yet. The pinned chunks
>> list contains chunks that are being released in the current transaction.
>> Both describe chunks that are not reflected on disk as in use but are
>> unavailable just the same.
>>
>> The pending chunks list is anchored by the transaction handle, which
>> means that we need to hold a reference to a transaction when working
>> with the list.
>>
>> We use these lists to ensure that we don't end up discarding chunks
>> that are allocated or released in the current transaction.  What we r
>>
>> The way we use them is by iterating over both lists to perform
>> comparisons on the stripes they describe for each device. This is
>> backwards and requires that we keep a transaction handle open while
>> we're trimming.
>>
>> This patchset adds an extent_io_tree to btrfs_device that maintains
>> the allocation state of the device.  Extents are set dirty when
>> chunks are first allocated -- when the extent maps are added to the
>> mapping tree. They're cleared when last removed -- when the extent
>> maps are removed from the mapping tree. This matches the lifespan
>> of the pending and pinned chunks list and allows us to do trims
>> on unallocated space safely without pinning the transaction for what
>> may be a lengthy operation. We can also use this io tree to mark
>> which chunks have already been trimmed so we don't repeat the operation.
>>
>> Signed-off-by: Nikolay Borisov <nborisov@suse.com>
>> ---
>>  fs/btrfs/ctree.h            |  6 ---
>>  fs/btrfs/disk-io.c          | 11 -----
>>  fs/btrfs/extent-tree.c      | 28 -----------
>>  fs/btrfs/extent_io.c        |  2 +-
>>  fs/btrfs/extent_io.h        |  6 ++-
>>  fs/btrfs/extent_map.c       | 36 ++++++++++++++
>>  fs/btrfs/extent_map.h       |  1 -
>>  fs/btrfs/free-space-cache.c |  4 --
>>  fs/btrfs/transaction.c      |  9 ----
>>  fs/btrfs/transaction.h      |  1 -
>>  fs/btrfs/volumes.c          | 96 +++++++++++++------------------------
>>  fs/btrfs/volumes.h          |  2 +
>>  12 files changed, 76 insertions(+), 126 deletions(-)
>>
>> --- a/fs/btrfs/extent_io.c
>> +++ b/fs/btrfs/extent_io.c
>> @@ -918,7 +918,7 @@ static void cache_state(struct extent_state *state,
>>   * [start, end] is inclusive This takes the tree lock.
>>   */
>>  
>> -static int __must_check
>> +int __must_check
>>  __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
>>  		 unsigned bits, unsigned exclusive_bits,
>>  		 u64 *failed_start, struct extent_state **cached_state,
> 
> Does this really need to be exported again? There are helpers that can
> wrap specific combinations of parameters.

This is exported so that __set_extent_bit could be called with
GFP_NOWAIT parameter otherwise I was getting lockdep splats since
extent_map_device_set_bits (called from add_extent_mapping) is called
under a write_lock hence we can't sleep.

> 
>> @@ -335,6 +335,8 @@ void btrfs_free_device(struct btrfs_device *device)
>>  {
>>  	WARN_ON(!list_empty(&device->post_commit_list));
>>  	rcu_string_free(device->name);
>> +	if (!in_softirq())
>> +		extent_io_tree_release(&device->alloc_state);

This is used to distinguish between btrfs_free_device being called from
btrfs_close_devices in close_ctree i.e non rcu (hence no softirq )
context or any of the error handlers and from free_device_rcu. In the
latter case the extent tree is already freed in btrfs_close_one_device,
hence there is no need to do it in the RCU callback.

Furthermore, there is also a comment that the extent io tree cannot be
destroyed in RCU context because extent_io_tree_release calls
cond_resched_lock which in turn could sleep, but this is forbidden in
RCU context.

> 
> This needs a comment
> 
>>  	bio_put(device->flush_bio);
>>  	kfree(device);
>>  }
> 
> The commit is quite big but I don't see how to shrink it, the changes
> need to be done in several places. So, probably ok.
>
David Sterba March 25, 2019, 4:57 p.m. UTC | #4
On Mon, Mar 25, 2019 at 06:43:49PM +0200, Nikolay Borisov wrote:
> >> -static int __must_check
> >> +int __must_check
> >>  __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
> >>  		 unsigned bits, unsigned exclusive_bits,
> >>  		 u64 *failed_start, struct extent_state **cached_state,
> > 
> > Does this really need to be exported again? There are helpers that can
> > wrap specific combinations of parameters.
> 
> This is exported so that __set_extent_bit could be called with
> GFP_NOWAIT parameter otherwise I was getting lockdep splats since
> extent_map_device_set_bits (called from add_extent_mapping) is called
> under a write_lock hence we can't sleep.

So then we should add a helper for that, like

  int set_extent_bit_nowait(...)

similar to set_record_extent_bits.

> >> @@ -335,6 +335,8 @@ void btrfs_free_device(struct btrfs_device *device)
> >>  {
> >>  	WARN_ON(!list_empty(&device->post_commit_list));
> >>  	rcu_string_free(device->name);
> >> +	if (!in_softirq())
> >> +		extent_io_tree_release(&device->alloc_state);
> 
> This is used to distinguish between btrfs_free_device being called from
> btrfs_close_devices in close_ctree i.e non rcu (hence no softirq )
> context or any of the error handlers and from free_device_rcu. In the
> latter case the extent tree is already freed in btrfs_close_one_device,
> hence there is no need to do it in the RCU callback.

Ok, I can cook a comment from that, thanks.

> Furthermore, there is also a comment that the extent io tree cannot be
> destroyed in RCU context because extent_io_tree_release calls
> cond_resched_lock which in turn could sleep, but this is forbidden in
> RCU context.

Yeah, comment in the right place.
diff mbox series

Patch

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 880b5abd8ecc..0b25c2f1b77d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1152,12 +1152,6 @@  struct btrfs_fs_info {
 	struct mutex unused_bg_unpin_mutex;
 	struct mutex delete_unused_bgs_mutex;
 
-	/*
-	 * Chunks that can't be freed yet (under a trim/discard operation)
-	 * and will be latter freed. Protected by fs_info->chunk_mutex.
-	 */
-	struct list_head pinned_chunks;
-
 	/* Cached block sizes */
 	u32 nodesize;
 	u32 sectorsize;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 81183694f8ea..813fc21b2b0b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2789,8 +2789,6 @@  int open_ctree(struct super_block *sb,
 	init_waitqueue_head(&fs_info->async_submit_wait);
 	init_waitqueue_head(&fs_info->delayed_iputs_wait);
 
-	INIT_LIST_HEAD(&fs_info->pinned_chunks);
-
 	/* Usable values until the real ones are cached from the superblock */
 	fs_info->nodesize = 4096;
 	fs_info->sectorsize = 4096;
@@ -4065,15 +4063,6 @@  void close_ctree(struct btrfs_fs_info *fs_info)
 
 	btrfs_free_stripe_hash_table(fs_info);
 	btrfs_free_ref_cache(fs_info);
-
-	while (!list_empty(&fs_info->pinned_chunks)) {
-		struct extent_map *em;
-
-		em = list_first_entry(&fs_info->pinned_chunks,
-				      struct extent_map, list);
-		list_del_init(&em->list);
-		free_extent_map(em);
-	}
 }
 
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index dcda3c4de240..eb4b7f2b10a1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10946,10 +10946,6 @@  int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	memcpy(&key, &block_group->key, sizeof(key));
 
 	mutex_lock(&fs_info->chunk_mutex);
-	if (!list_empty(&em->list)) {
-		/* We're in the transaction->pending_chunks list. */
-		free_extent_map(em);
-	}
 	spin_lock(&block_group->lock);
 	block_group->removed = 1;
 	/*
@@ -10976,25 +10972,6 @@  int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	 * the transaction commit has completed.
 	 */
 	remove_em = (atomic_read(&block_group->trimming) == 0);
-	/*
-	 * Make sure a trimmer task always sees the em in the pinned_chunks list
-	 * if it sees block_group->removed == 1 (needs to lock block_group->lock
-	 * before checking block_group->removed).
-	 */
-	if (!remove_em) {
-		/*
-		 * Our em might be in trans->transaction->pending_chunks which
-		 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
-		 * and so is the fs_info->pinned_chunks list.
-		 *
-		 * So at this point we must be holding the chunk_mutex to avoid
-		 * any races with chunk allocation (more specifically at
-		 * volumes.c:contains_pending_extent()), to ensure it always
-		 * sees the em, either in the pending_chunks list or in the
-		 * pinned_chunks list.
-		 */
-		list_move_tail(&em->list, &fs_info->pinned_chunks);
-	}
 	spin_unlock(&block_group->lock);
 
 	if (remove_em) {
@@ -11002,11 +10979,6 @@  int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
 		em_tree = &fs_info->mapping_tree.map_tree;
 		write_lock(&em_tree->lock);
-		/*
-		 * The em might be in the pending_chunks list, so make sure the
-		 * chunk mutex is locked, since remove_extent_mapping() will
-		 * delete us from that list.
-		 */
 		remove_extent_mapping(em_tree, em);
 		write_unlock(&em_tree->lock);
 		/* once for the tree */
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0e953c469224..ff3f8443e180 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -918,7 +918,7 @@  static void cache_state(struct extent_state *state,
  * [start, end] is inclusive This takes the tree lock.
  */
 
-static int __must_check
+int __must_check
 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		 unsigned bits, unsigned exclusive_bits,
 		 u64 *failed_start, struct extent_state **cached_state,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 0fda249e5982..4bcc203b5431 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -329,7 +329,11 @@  int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		   unsigned bits, u64 *failed_start,
 		   struct extent_state **cached_state, gfp_t mask);
-
+int
+__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		 unsigned bits, unsigned exclusive_bits,
+		 u64 *failed_start, struct extent_state **cached_state,
+		 gfp_t mask, struct extent_changeset *changeset);
 static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
 		u64 end, unsigned bits)
 {
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 928f729c55ba..0820f6fcf3a6 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -4,6 +4,7 @@ 
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include "ctree.h"
+#include "volumes.h"
 #include "extent_map.h"
 #include "compression.h"
 
@@ -336,6 +337,37 @@  static inline void setup_extent_mapping(struct extent_map_tree *tree,
 	else
 		try_merge_map(tree, em);
 }
+static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
+{
+	struct map_lookup *map = em->map_lookup;
+	u64 stripe_size = em->orig_block_len;
+	int i;
+
+	for (i = 0; i < map->num_stripes; i++) {
+		struct btrfs_bio_stripe *stripe = &map->stripes[i];
+		struct btrfs_device *device = stripe->dev;
+
+		__set_extent_bit(&device->alloc_state, stripe->physical,
+				 stripe->physical + stripe_size - 1, bits,
+				 0, NULL, NULL, GFP_NOWAIT, NULL);
+	}
+}
+
+static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
+{
+	struct map_lookup *map = em->map_lookup;
+	u64 stripe_size = em->orig_block_len;
+	int i;
+
+	for (i = 0; i < map->num_stripes; i++) {
+		struct btrfs_bio_stripe *stripe = &map->stripes[i];
+		struct btrfs_device *device = stripe->dev;
+
+		__clear_extent_bit(&device->alloc_state, stripe->physical,
+				   stripe->physical + stripe_size - 1, bits,
+				   0, 0, NULL, GFP_NOWAIT, NULL);
+	}
+}
 
 /**
  * add_extent_mapping - add new extent map to the extent tree
@@ -357,6 +389,8 @@  int add_extent_mapping(struct extent_map_tree *tree,
 		goto out;
 
 	setup_extent_mapping(tree, em, modified);
+	if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
+		extent_map_device_set_bits(em, CHUNK_ALLOCATED);
 out:
 	return ret;
 }
@@ -438,6 +472,8 @@  void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 	rb_erase_cached(&em->rb_node, &tree->map);
 	if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
 		list_del_init(&em->list);
+	if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
+		extent_map_device_clear_bits(em, CHUNK_ALLOCATED);
 	RB_CLEAR_NODE(&em->rb_node);
 }
 
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 473f039fcd7c..72b46833f236 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -91,7 +91,6 @@  void replace_extent_mapping(struct extent_map_tree *tree,
 			    struct extent_map *cur,
 			    struct extent_map *new,
 			    int modified);
-
 struct extent_map *alloc_extent_map(void);
 void free_extent_map(struct extent_map *em);
 int __init extent_map_init(void);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 74aa552f4793..207fb50dcc7a 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -3366,10 +3366,6 @@  void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
 		em = lookup_extent_mapping(em_tree, block_group->key.objectid,
 					   1);
 		BUG_ON(!em); /* logic error, can't happen */
-		/*
-		 * remove_extent_mapping() will delete us from the pinned_chunks
-		 * list, which is protected by the chunk mutex.
-		 */
 		remove_extent_mapping(em_tree, em);
 		write_unlock(&em_tree->lock);
 		mutex_unlock(&fs_info->chunk_mutex);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b32769998bbb..e5404326fc55 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -50,14 +50,6 @@  void btrfs_put_transaction(struct btrfs_transaction *transaction)
 			btrfs_err(transaction->fs_info,
 				  "pending csums is %llu",
 				  transaction->delayed_refs.pending_csums);
-		while (!list_empty(&transaction->pending_chunks)) {
-			struct extent_map *em;
-
-			em = list_first_entry(&transaction->pending_chunks,
-					      struct extent_map, list);
-			list_del_init(&em->list);
-			free_extent_map(em);
-		}
 		/*
 		 * If any block groups are found in ->deleted_bgs then it's
 		 * because the transaction was aborted and a commit did not
@@ -235,7 +227,6 @@  static noinline int join_transaction(struct btrfs_fs_info *fs_info,
 	spin_lock_init(&cur_trans->delayed_refs.lock);
 
 	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
-	INIT_LIST_HEAD(&cur_trans->pending_chunks);
 	INIT_LIST_HEAD(&cur_trans->dev_update_list);
 	INIT_LIST_HEAD(&cur_trans->switch_commits);
 	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 2bd76f681520..4419a4a0294b 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -51,7 +51,6 @@  struct btrfs_transaction {
 	wait_queue_head_t writer_wait;
 	wait_queue_head_t commit_wait;
 	struct list_head pending_snapshots;
-	struct list_head pending_chunks;
 	struct list_head dev_update_list;
 	struct list_head switch_commits;
 	struct list_head dirty_bgs;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 39a107bdde06..a06f1e3c63b9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -335,6 +335,8 @@  void btrfs_free_device(struct btrfs_device *device)
 {
 	WARN_ON(!list_empty(&device->post_commit_list));
 	rcu_string_free(device->name);
+	if (!in_softirq())
+		extent_io_tree_release(&device->alloc_state);
 	bio_put(device->flush_bio);
 	kfree(device);
 }
@@ -411,6 +413,7 @@  static struct btrfs_device *__alloc_device(void)
 	btrfs_device_data_ordered_init(dev);
 	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+	extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
 
 	return dev;
 }
@@ -1269,6 +1272,9 @@  static void btrfs_close_one_device(struct btrfs_device *device)
 	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
 		fs_devices->missing_devices--;
 
+	/* Remove alloc state now since it cannot be done in RCU context */
+	extent_io_tree_release(&device->alloc_state);
+
 	btrfs_close_bdev(device);
 
 	new_device = btrfs_alloc_device(NULL, &device->devid,
@@ -1505,58 +1511,29 @@  struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
 	return device;
 }
 
-static int contains_pending_extent(struct btrfs_transaction *transaction,
-				   struct btrfs_device *device,
-				   u64 *start, u64 len)
-{
-	struct btrfs_fs_info *fs_info = device->fs_info;
-	struct extent_map *em;
-	struct list_head *search_list = &fs_info->pinned_chunks;
-	int ret = 0;
-	u64 physical_start = *start;
-
-	if (transaction)
-		search_list = &transaction->pending_chunks;
-again:
-	list_for_each_entry(em, search_list, list) {
-		struct map_lookup *map;
-		int i;
-
-		map = em->map_lookup;
-		for (i = 0; i < map->num_stripes; i++) {
-			u64 end;
-
-			if (map->stripes[i].dev != device)
-				continue;
-			if (map->stripes[i].physical >= physical_start + len ||
-			    map->stripes[i].physical + em->orig_block_len <=
-			    physical_start)
-				continue;
-			/*
-			 * Make sure that while processing the pinned list we do
-			 * not override our *start with a lower value, because
-			 * we can have pinned chunks that fall within this
-			 * device hole and that have lower physical addresses
-			 * than the pending chunks we processed before. If we
-			 * do not take this special care we can end up getting
-			 * 2 pending chunks that start at the same physical
-			 * device offsets because the end offset of a pinned
-			 * chunk can be equal to the start offset of some
-			 * pending chunk.
-			 */
-			end = map->stripes[i].physical + em->orig_block_len;
-			if (end > *start) {
-				*start = end;
-				ret = 1;
-			}
+/*
+ * Tries to find a chunk that intersects the [start, start + len] range and,
+ * when one is found, records the offset just past its end in *start
+ */
+#define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
+static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
+				    u64 len)
+{
+	u64 physical_start, physical_end;
+	lockdep_assert_held(&device->fs_info->chunk_mutex);
+
+	if (!find_first_extent_bit(&device->alloc_state, *start,
+				   &physical_start, &physical_end,
+				   CHUNK_ALLOCATED, NULL)) {
+
+		if (in_range(physical_start, *start, len) ||
+		    in_range(*start, physical_start,
+			     physical_end - physical_start)) {
+			*start = physical_end + 1;
+			return true;
 		}
 	}
-	if (search_list != &fs_info->pinned_chunks) {
-		search_list = &fs_info->pinned_chunks;
-		goto again;
-	}
-
-	return ret;
+	return false;
 }
 
 
@@ -1667,15 +1644,12 @@  int find_free_dev_extent_start(struct btrfs_transaction *transaction,
 			 * Have to check before we set max_hole_start, otherwise
 			 * we could end up sending back this offset anyway.
 			 */
-			if (contains_pending_extent(transaction, device,
-						    &search_start,
+			if (contains_pending_extent(device, &search_start,
 						    hole_size)) {
-				if (key.offset >= search_start) {
+				if (key.offset >= search_start)
 					hole_size = key.offset - search_start;
-				} else {
-					WARN_ON_ONCE(1);
+				else
 					hole_size = 0;
-				}
 			}
 
 			if (hole_size > max_hole_size) {
@@ -1716,8 +1690,7 @@  int find_free_dev_extent_start(struct btrfs_transaction *transaction,
 	if (search_end > search_start) {
 		hole_size = search_end - search_start;
 
-		if (contains_pending_extent(transaction, device, &search_start,
-					    hole_size)) {
+		if (contains_pending_extent(device, &search_start, hole_size)) {
 			btrfs_release_path(path);
 			goto again;
 		}
@@ -4759,7 +4732,7 @@  int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	 * in-memory chunks are synced to disk so that the loop below sees them
 	 * and relocates them accordingly.
 	 */
-	if (contains_pending_extent(trans->transaction, device, &start, diff)) {
+	if (contains_pending_extent(device, &start, diff)) {
 		mutex_unlock(&fs_info->chunk_mutex);
 		ret = btrfs_commit_transaction(trans);
 		if (ret)
@@ -5201,9 +5174,6 @@  static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		free_extent_map(em);
 		goto error;
 	}
-
-	list_add_tail(&em->list, &trans->transaction->pending_chunks);
-	refcount_inc(&em->refs);
 	write_unlock(&em_tree->lock);
 
 	ret = btrfs_make_block_group(trans, 0, type, start, chunk_size);
@@ -5236,8 +5206,6 @@  static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	free_extent_map(em);
 	/* One for the tree reference */
 	free_extent_map(em);
-	/* One for the pending_chunks list reference */
-	free_extent_map(em);
 error:
 	kfree(devices_info);
 	return ret;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index a0f09aad3770..49dc737d8a54 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -134,6 +134,8 @@  struct btrfs_device {
 	/* Counter to record the change of device stats */
 	atomic_t dev_stats_ccnt;
 	atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
+
+	struct extent_io_tree alloc_state;
 };
 
 /*