Btrfs: use per-cpu pools for reserving metadata space
diff mbox

Message ID 20100111164220.GA2360@localhost.localdomain
State New, archived
Headers show

Commit Message

Josef Bacik Jan. 11, 2010, 4:42 p.m. UTC
None

Patch
diff mbox

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8c57180..1a4014b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -666,6 +666,13 @@  struct btrfs_block_group_item {
 	__le64 flags;
 } __attribute__ ((__packed__));
 
+struct btrfs_reserved_space_pool {	/* allocated per cpu in open_ctree() */
+	u64 total_bytes;	/* space this pool is allowed to hand out */
+	u64 reserved_bytes;	/* bytes added by the metadata reserve calls */
+	u64 used_bytes;		/* bytes added by btrfs_alloc_free_block() */
+	spinlock_t lock;	/* taken around every counter update */
+};
+
 struct btrfs_space_info {
 	u64 flags;
 
@@ -688,8 +695,6 @@  struct btrfs_space_info {
 				   chunks for this space */
 	int force_alloc;	/* set if we need to force a chunk alloc for
 				   this space */
-	int force_delalloc;	/* make people start doing filemap_flush until
-				   we're under a threshold */
 
 	struct list_head list;
 
@@ -980,6 +985,7 @@  struct btrfs_fs_info {
 	unsigned metadata_ratio;
 
 	void *bdev_holder;
+	struct btrfs_reserved_space_pool *reserved_space_pool;
 };
 
 /*
@@ -2051,6 +2057,7 @@  void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
 				 u64 bytes);
 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
 			      u64 bytes);
+void btrfs_init_space_pools(struct btrfs_fs_info *fs_info);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 02b6afb..d02a6ea 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1575,6 +1575,7 @@  struct btrfs_root *open_ctree(struct super_block *sb,
 	struct btrfs_root *log_tree_root;
 
 	int ret;
+	int i;
 	int err = -EINVAL;
 
 	struct btrfs_super_block *disk_super;
@@ -1917,8 +1918,23 @@  struct btrfs_root *open_ctree(struct super_block *sb,
 
 	csum_root->track_dirty = 1;
 
+	fs_info->reserved_space_pool =
+		alloc_percpu(struct btrfs_reserved_space_pool);
+	if (!fs_info->reserved_space_pool)
+		goto fail_csum_root;
+
+	for_each_possible_cpu(i) {
+		struct btrfs_reserved_space_pool *pool;
+		pool = per_cpu_ptr(fs_info->reserved_space_pool, i);
+		spin_lock_init(&pool->lock);
+		pool->total_bytes = 0;
+		pool->reserved_bytes = 0;
+		pool->used_bytes = 0;
+	}
+
 	btrfs_read_block_groups(extent_root);
 
+	btrfs_init_space_pools(fs_info);
 	fs_info->generation = generation;
 	fs_info->last_trans_committed = generation;
 	fs_info->data_alloc_profile = (u64)-1;
@@ -2442,6 +2458,7 @@  int close_ctree(struct btrfs_root *root)
 	free_extent_buffer(root->fs_info->csum_root->commit_root);
 
 	btrfs_free_block_groups(root->fs_info);
+	free_percpu(fs_info->reserved_space_pool);
 
 	del_fs_roots(fs_info);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c2f3cee..05eac97 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2660,6 +2660,7 @@  static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 		found->full = 0;
 		spin_unlock(&found->lock);
 		*space_info = found;
+		btrfs_init_space_pools(info);
 		return 0;
 	}
 	found = kzalloc(sizeof(*found), GFP_NOFS);
@@ -2667,6 +2668,7 @@  static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&found->block_groups);
+	init_waitqueue_head(&found->flush_wait);
 	init_rwsem(&found->groups_sem);
 	spin_lock_init(&found->lock);
 	found->flags = flags;
@@ -2681,6 +2683,10 @@  static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	*space_info = found;
 	list_add_rcu(&found->list, &info->space_info);
 	atomic_set(&found->caching_threads, 0);
+
+	if (flags & BTRFS_BLOCK_GROUP_METADATA)
+		btrfs_init_space_pools(info);
+
 	return 0;
 }
 
@@ -2815,63 +2821,20 @@  static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
 int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
 					  struct inode *inode, int num_items)
 {
-	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_space_info *meta_sinfo;
-	u64 num_bytes;
-	u64 alloc_target;
-	bool bug = false;
-
-	/* get the space info for where the metadata will live */
-	alloc_target = btrfs_get_alloc_profile(root, 0);
-	meta_sinfo = __find_space_info(info, alloc_target);
-
-	num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
-					   num_items);
-
-	spin_lock(&meta_sinfo->lock);
 	spin_lock(&BTRFS_I(inode)->accounting_lock);
 	if (BTRFS_I(inode)->reserved_extents <=
 	    BTRFS_I(inode)->outstanding_extents) {
 		spin_unlock(&BTRFS_I(inode)->accounting_lock);
-		spin_unlock(&meta_sinfo->lock);
 		return 0;
 	}
-	spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
 	BTRFS_I(inode)->reserved_extents--;
-	BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
-
-	if (meta_sinfo->bytes_delalloc < num_bytes) {
-		bug = true;
-		meta_sinfo->bytes_delalloc = 0;
-	} else {
-		meta_sinfo->bytes_delalloc -= num_bytes;
-	}
-	spin_unlock(&meta_sinfo->lock);
+	spin_unlock(&BTRFS_I(inode)->accounting_lock); /* count dropped under lock */
 
-	BUG_ON(bug);
+	btrfs_unreserve_metadata_space(root, num_items); /* bytes go back to the pools */
 
 	return 0;
 }
 
-static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
-{
-	u64 thresh;
-
-	thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-		meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-		meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-		meta_sinfo->bytes_may_use;
-
-	thresh = meta_sinfo->total_bytes - thresh;
-	thresh *= 80;
-	do_div(thresh, 100);
-	if (thresh <= meta_sinfo->bytes_delalloc)
-		meta_sinfo->force_delalloc = 1;
-	else
-		meta_sinfo->force_delalloc = 0;
-}
-
 struct async_flush {
 	struct btrfs_root *root;
 	struct btrfs_space_info *info;
@@ -2900,10 +2863,18 @@  static noinline void flush_delalloc_async(struct btrfs_work *work)
 	kfree(async);
 }
 
-static void wait_on_flush(struct btrfs_space_info *info)
+static void wait_on_flush(struct btrfs_root *root, struct btrfs_space_info *info)
 {
 	DEFINE_WAIT(wait);
-	u64 used;
+	u64 num_bytes;
+	u64 free;
+	int i;
+
+	/*
+	 * Number of CPU's * the maximum number of reservations that anybody
+	 * would ever want to use
+	 */
+	num_bytes = calculate_bytes_needed(root, nr_cpu_ids * 5);
 
 	while (1) {
 		prepare_to_wait(&info->flush_wait, &wait,
@@ -2914,14 +2885,28 @@  static void wait_on_flush(struct btrfs_space_info *info)
 			break;
 		}
 
-		used = info->bytes_used + info->bytes_reserved +
-			info->bytes_pinned + info->bytes_readonly +
-			info->bytes_super + info->bytes_root +
-			info->bytes_may_use + info->bytes_delalloc;
-		if (used < info->total_bytes) {
+		free = 0;
+		for_each_possible_cpu(i) {
+			struct btrfs_reserved_space_pool *pool;
+			pool = per_cpu_ptr(root->fs_info->reserved_space_pool, i);
+			spin_lock(&pool->lock);
+			if (pool->used_bytes + pool->reserved_bytes >=
+			    pool->total_bytes) {
+				spin_unlock(&pool->lock);
+				continue;
+			}
+			free += pool->total_bytes - pool->used_bytes -
+				pool->reserved_bytes;
+			spin_unlock(&pool->lock);
+			if (free > num_bytes)
+				break;
+		}
+
+		if (free > num_bytes) {
 			spin_unlock(&info->lock);
 			break;
 		}
+
 		spin_unlock(&info->lock);
 		schedule();
 	}
@@ -2946,7 +2931,7 @@  static void flush_delalloc(struct btrfs_root *root,
 	spin_unlock(&info->lock);
 
 	if (wait) {
-		wait_on_flush(info);
+		wait_on_flush(root, info);
 		return;
 	}
 
@@ -2960,7 +2945,7 @@  static void flush_delalloc(struct btrfs_root *root,
 
 	btrfs_queue_worker(&root->fs_info->enospc_workers,
 			   &async->work);
-	wait_on_flush(info);
+	wait_on_flush(root, info);
 	return;
 
 flush:
@@ -2990,6 +2975,7 @@  static int maybe_allocate_chunk(struct btrfs_root *root,
 	 */
 	min_metadata = min((u64)10 * 1024 * 1024 * 1024,
 			     div64_u64(free_space * 5, 100));
+	spin_lock(&info->lock);
 	if (info->total_bytes >= min_metadata) {
 		spin_unlock(&info->lock);
 		return 0;
@@ -3026,8 +3012,6 @@  static int maybe_allocate_chunk(struct btrfs_root *root,
 			     4096 + 2 * 1024 * 1024,
 			     info->flags, 0);
 	btrfs_end_transaction(trans, root);
-	if (ret)
-		goto out;
 out:
 	spin_lock(&info->lock);
 	info->allocating_chunk = 0;
@@ -3045,72 +3029,135 @@  out:
 int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
 					struct inode *inode, int num_items)
 {
+	struct btrfs_reserved_space_pool *pool;
 	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_space_info *meta_sinfo;
+	struct btrfs_space_info *meta_sinfo = NULL;
+	bool chunk_allocated = false;
+	bool delalloc_flushed = false;
+	bool inode_flushed = false;
+	u64 realloc_bytes = 0;	/* bytes stolen from other cpus' pools */
 	u64 num_bytes;
-	u64 used;
 	u64 alloc_target;
-	int flushed = 0;
-	int force_delalloc;
+	int retries = 0;	/* non-zero once num_bytes is in reserved_bytes */
+	int i;
 
-	/* get the space info for where the metadata will live */
-	alloc_target = btrfs_get_alloc_profile(root, 0);
-	meta_sinfo = __find_space_info(info, alloc_target);
+	num_bytes = calculate_bytes_needed(root, num_items);
+
+	pool = per_cpu_ptr(info->reserved_space_pool,
+			   raw_smp_processor_id());
 
-	num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
-					   num_items);
 again:
-	spin_lock(&meta_sinfo->lock);
+	spin_lock(&pool->lock);
 
-	force_delalloc = meta_sinfo->force_delalloc;
+	if (realloc_bytes >= num_bytes) {	/* stole enough, keep it here */
+		pool->total_bytes += realloc_bytes;
+		spin_lock(&BTRFS_I(inode)->accounting_lock);
+		BTRFS_I(inode)->reserved_extents++;
+		spin_unlock(&BTRFS_I(inode)->accounting_lock);
+		spin_unlock(&pool->lock);
+		return 0;
+	}
 
-	if (unlikely(!meta_sinfo->bytes_root))
-		meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+	if (!retries)
+		pool->reserved_bytes += num_bytes;
 
-	if (!flushed)
-		meta_sinfo->bytes_delalloc += num_bytes;
+	/*
+	 * Fast path, we have plenty of space in this pool to use, go ahead and
+	 * use it and move on.
+	 */
+	if (pool->reserved_bytes + pool->used_bytes <= pool->total_bytes) {
+		spin_lock(&BTRFS_I(inode)->accounting_lock);
+		BTRFS_I(inode)->reserved_extents++;
+		spin_unlock(&BTRFS_I(inode)->accounting_lock);
+		spin_unlock(&pool->lock);
+		return 0;
+	}
 
-	used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-		meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-		meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-		meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+	retries++;
+	spin_unlock(&pool->lock);
 
-	if (used > meta_sinfo->total_bytes) {
-		flushed++;
+	/*
+	 * Ok didn't find anything, try and steal from somebody else's pool.
+	 */
+	for_each_possible_cpu(i) {
+		struct btrfs_reserved_space_pool *tmp_pool;
+		u64 free_bytes;
 
-		if (flushed == 1) {
-			if (maybe_allocate_chunk(root, meta_sinfo))
-				goto again;
-			flushed++;
-		} else {
-			spin_unlock(&meta_sinfo->lock);
+		tmp_pool = per_cpu_ptr(info->reserved_space_pool, i);
+		if (pool == tmp_pool)
+			continue;
+
+		spin_lock(&tmp_pool->lock);
+
+		if (tmp_pool->reserved_bytes + tmp_pool->used_bytes >=
+		    tmp_pool->total_bytes) {
+			spin_unlock(&tmp_pool->lock);
+			continue;
 		}
 
-		if (flushed == 2) {
-			filemap_flush(inode->i_mapping);
-			goto again;
-		} else if (flushed == 3) {
-			flush_delalloc(root, meta_sinfo);
+		free_bytes = tmp_pool->total_bytes - tmp_pool->used_bytes -
+			tmp_pool->reserved_bytes;
+
+		/*
+		 * If this pool has reserved bytes, but still has a lot of free
+		 * space, only take half of the free space.  The idea here is
+		 * that
+		 *
+		 * 1) If only one processor is doing the work then the others
+		 * won't have a lot of reserved bytes, and we can steal all of
+		 * their free space.
+		 *
+		 * 2) If all the processors are doing work, then we don't want
+		 * to steal a whole lot from them, but on the other hand we
+		 * don't want to have to keep stealing small amounts from
+		 * everybody, so take half the space and hope that this
+		 * processor will be back to use more space.
+		 */
+		if (tmp_pool->reserved_bytes > num_bytes &&
+		    num_bytes < free_bytes && num_bytes <= (free_bytes >> 1))
+			free_bytes = free_bytes >> 1;
+
+		realloc_bytes += free_bytes;
+		tmp_pool->total_bytes -= free_bytes;
+		spin_unlock(&tmp_pool->lock);
+
+		if (num_bytes <= realloc_bytes)	/* only retry once we have enough */
 			goto again;
-		}
-		spin_lock(&meta_sinfo->lock);
-		meta_sinfo->bytes_delalloc -= num_bytes;
-		spin_unlock(&meta_sinfo->lock);
-		printk(KERN_ERR "enospc, has %d, reserved %d\n",
-		       BTRFS_I(inode)->outstanding_extents,
-		       BTRFS_I(inode)->reserved_extents);
-		dump_space_info(meta_sinfo, 0, 0);
-		return -ENOSPC;
 	}
 
-	BTRFS_I(inode)->reserved_extents++;
-	check_force_delalloc(meta_sinfo);
-	spin_unlock(&meta_sinfo->lock);
-
-	if (!flushed && force_delalloc)
+	if (!inode_flushed) {
+		inode_flushed = true;
 		filemap_flush(inode->i_mapping);
+		goto again;
+	}
 
-	return 0;
+	if (!meta_sinfo) {
+		/* get the space info for where the metadata will live */
+		alloc_target = btrfs_get_alloc_profile(root, 0);
+		meta_sinfo = __find_space_info(info, alloc_target);
+	}
+
+	if (!delalloc_flushed) {
+		delalloc_flushed = true;
+		flush_delalloc(root, meta_sinfo);
+		goto again;
+	}
+
+	if (!chunk_allocated) {
+		chunk_allocated = true;
+		btrfs_wait_ordered_extents(root, 0);
+		maybe_allocate_chunk(root, meta_sinfo);
+		goto again;
+	}
+
+	spin_lock(&pool->lock);	/* give up: undo our reservation */
+	pool->reserved_bytes -= num_bytes;
+	if (realloc_bytes)	/* whatever we stole stays in this pool */
+		pool->total_bytes += realloc_bytes;
+	spin_unlock(&pool->lock);
+
+	printk(KERN_ERR "delalloc reserve ran out of space!!!!\n");
+	return -ENOSPC;
 }
 
 /*
@@ -3124,28 +3171,54 @@  again:
  */
 int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
 {
+	struct btrfs_reserved_space_pool *pool;
 	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_space_info *meta_sinfo;
+	struct btrfs_space_info *meta_sinfo = NULL;
 	u64 num_bytes;
-	u64 alloc_target;
-	bool bug = false;
+	u64 alloc_target = btrfs_get_alloc_profile(root, 0);
+	int i;
 
-	/* get the space info for where the metadata will live */
-	alloc_target = btrfs_get_alloc_profile(root, 0);
+	num_bytes = calculate_bytes_needed(root, num_items);
+
+	pool = per_cpu_ptr(info->reserved_space_pool, raw_smp_processor_id());
 	meta_sinfo = __find_space_info(info, alloc_target);
 
-	num_bytes = calculate_bytes_needed(root, num_items);
+	spin_lock(&pool->lock);	/* try the pool of the cpu we are on first */
+	if (num_bytes <= pool->reserved_bytes) {
+		pool->reserved_bytes -= num_bytes;
+		spin_unlock(&pool->lock);
+		if (waitqueue_active(&meta_sinfo->flush_wait))
+			wake_up(&meta_sinfo->flush_wait);
+		return 0;
+	}
 
-	spin_lock(&meta_sinfo->lock);
-	if (meta_sinfo->bytes_may_use < num_bytes) {
-		bug = true;
-		meta_sinfo->bytes_may_use = 0;
-	} else {
-		meta_sinfo->bytes_may_use -= num_bytes;
+	num_bytes -= pool->reserved_bytes;
+	pool->reserved_bytes = 0;
+	spin_unlock(&pool->lock);
+
+	/*
+	 * We may have moved processors between the reservation and here, so
+	 * take the remaining bytes away from whichever pools still hold
+	 * reservations.  Leave the loop with break rather than return so the
+	 * flush_wait waiters below are woken on this path as well.
+	 */
+	for_each_possible_cpu(i) {
+		pool = per_cpu_ptr(info->reserved_space_pool, i);
+		spin_lock(&pool->lock);
+		if (num_bytes <= pool->reserved_bytes) {
+			pool->reserved_bytes -= num_bytes;
+			num_bytes = 0;
+			spin_unlock(&pool->lock);
+			break;
+		}
+		num_bytes -= pool->reserved_bytes;
+		pool->reserved_bytes = 0;
+		spin_unlock(&pool->lock);
 	}
-	spin_unlock(&meta_sinfo->lock);
 
-	BUG_ON(bug);
+	if (waitqueue_active(&meta_sinfo->flush_wait))
+		wake_up(&meta_sinfo->flush_wait);
+
+	WARN_ON(num_bytes);	/* asked to release more than the pools held */
 
 	return 0;
 }
@@ -3165,58 +3238,220 @@  int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
  */
 int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
 {
+	struct btrfs_reserved_space_pool *pool;
 	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_space_info *meta_sinfo;
+	struct btrfs_space_info *meta_sinfo = NULL;
+	bool chunk_allocated = false;
+	bool delalloc_flushed = false;
+	bool committed = false;
+	u64 realloc_bytes = 0;	/* bytes stolen from other cpus' pools */
 	u64 num_bytes;
-	u64 used;
 	u64 alloc_target;
 	int retries = 0;
-
-	/* get the space info for where the metadata will live */
-	alloc_target = btrfs_get_alloc_profile(root, 0);
-	meta_sinfo = __find_space_info(info, alloc_target);
+	int i;
 
 	num_bytes = calculate_bytes_needed(root, num_items);
+
+	pool = per_cpu_ptr(info->reserved_space_pool, raw_smp_processor_id());
+
 again:
-	spin_lock(&meta_sinfo->lock);
+	spin_lock(&pool->lock);
 
-	if (unlikely(!meta_sinfo->bytes_root))
-		meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+	/*
+	 * If we've managed to acquire enough bytes from other pools then add
+	 * them to our total bytes and exit.
+	 */
+	if (realloc_bytes >= num_bytes) {
+		pool->total_bytes += realloc_bytes;
+		spin_unlock(&pool->lock);
+		return 0;
+	}
 
 	if (!retries)
-		meta_sinfo->bytes_may_use += num_bytes;
+		pool->reserved_bytes += num_bytes;
 
-	used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-		meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-		meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-		meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+	/*
+	 * Fast path, we have plenty of space in this pool to use, go ahead and
+	 * use it and move on.
+	 */
+	if (pool->reserved_bytes + pool->used_bytes <= pool->total_bytes) {
+		spin_unlock(&pool->lock);
+		return 0;
+	}
 
-	if (used > meta_sinfo->total_bytes) {
-		retries++;
-		if (retries == 1) {
-			if (maybe_allocate_chunk(root, meta_sinfo))
-				goto again;
-			retries++;
-		} else {
-			spin_unlock(&meta_sinfo->lock);
+	retries++;
+	spin_unlock(&pool->lock);
+
+	/*
+	 * Ok don't have enough space, try and steal from somebody else's pool.
+	 */
+	for_each_possible_cpu(i) {
+		struct btrfs_reserved_space_pool *tmp_pool;
+		u64 free_bytes;
+
+		tmp_pool = per_cpu_ptr(info->reserved_space_pool, i);
+		if (tmp_pool == pool)
+			continue;
+
+		spin_lock(&tmp_pool->lock);
+
+		if (tmp_pool->reserved_bytes + tmp_pool->used_bytes >=
+		    tmp_pool->total_bytes) {
+			spin_unlock(&tmp_pool->lock);
+			continue;
 		}
 
-		if (retries == 2) {
-			flush_delalloc(root, meta_sinfo);
+		free_bytes = tmp_pool->total_bytes - tmp_pool->used_bytes -
+			tmp_pool->reserved_bytes;
+
+		/* Only take 1/2 of the free space if it's more than enough */
+		if (tmp_pool->reserved_bytes > num_bytes &&
+		    num_bytes < free_bytes && num_bytes <= (free_bytes >> 1))
+			free_bytes = free_bytes >> 1;
+
+		realloc_bytes += free_bytes;
+		tmp_pool->total_bytes -= free_bytes;
+		spin_unlock(&tmp_pool->lock);
+
+		if (num_bytes <= realloc_bytes)
 			goto again;
-		}
-		spin_lock(&meta_sinfo->lock);
-		meta_sinfo->bytes_may_use -= num_bytes;
-		spin_unlock(&meta_sinfo->lock);
+	}
 
-		dump_space_info(meta_sinfo, 0, 0);
-		return -ENOSPC;
+	if (!meta_sinfo) {
+		/* get the space info for where the metadata will live */
+		alloc_target = btrfs_get_alloc_profile(root, 0);
+		meta_sinfo = __find_space_info(info, alloc_target);
 	}
 
-	check_force_delalloc(meta_sinfo);
+	if (!chunk_allocated) {
+		chunk_allocated = true;
+		if (maybe_allocate_chunk(root, meta_sinfo))
+			goto again;
+	}
+
+	if (!delalloc_flushed) {
+		delalloc_flushed = true;
+		flush_delalloc(root, meta_sinfo);
+		goto again;
+	}
+
+	if (!committed && !current->journal_info) {
+		struct btrfs_trans_handle *trans;
+		committed = true;
+		trans = btrfs_start_transaction(root, 1);
+		btrfs_commit_transaction(trans, root);
+		goto again;
+	}
+
+	/*
+	 * Couldn't beg/borrow/steal enough: undo our reservation, keep what
+	 * we stole in this pool and report ENOSPC.
+	 */
+	spin_lock(&pool->lock);
+	pool->reserved_bytes -= num_bytes;
+	if (realloc_bytes)
+		pool->total_bytes += realloc_bytes;
+	spin_unlock(&pool->lock);
+	return -ENOSPC;
+}
+
+/*
+ * (Re)distribute the free metadata space across the per-cpu reservation
+ * pools.  Called from update_space_info(), open_ctree() and at commit.
+ *
+ * Every pool's used_bytes is reset to 0 and its total_bytes is recomputed;
+ * reserved_bytes is left alone so outstanding reservations stay accounted.
+ * Does nothing if the metadata space_info has not been set up yet.
+ */
+void btrfs_init_space_pools(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_space_info *meta_sinfo = NULL;
+	struct btrfs_reserved_space_pool *pool;
+	u64 total = 0;
+	u64 per_pool;
+	u64 used;
+	u64 alloc_target;
+	int i;
+
+	/* get the space info for where the metadata will live */
+	alloc_target = btrfs_get_alloc_profile(fs_info->extent_root, 0);
+	meta_sinfo = __find_space_info(fs_info, alloc_target);
+
+	/*
+	 * This can happen during mount where we haven't quite set everything up
+	 * yet.
+	 */
+	if (!meta_sinfo)
+		return;
+
+	spin_lock(&meta_sinfo->lock);
+
+	if (unlikely(!meta_sinfo->bytes_root))
+		meta_sinfo->bytes_root =
+			calculate_bytes_needed(fs_info->extent_root, 6);
+
+	used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+		meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+		meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+		meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+
+	/*
+	 * Only use 80% of the free metadata space for reservation, so we have
+	 * some spill-over room.  If the space is already overcommitted there
+	 * is nothing free, so leave total at 0 instead of wrapping the u64.
+	 */
+	if (used < meta_sinfo->total_bytes)
+		total = meta_sinfo->total_bytes - used;
 	spin_unlock(&meta_sinfo->lock);
+	total *= 80;
+	total = div64_u64(total, 100);
+
-	return 0;
+	per_pool = div64_u64(total, nr_cpu_ids);
+	for_each_possible_cpu(i) {
+		pool = per_cpu_ptr(fs_info->reserved_space_pool, i);
+		spin_lock(&pool->lock);
+		pool->used_bytes = 0;
+
+		/*
+		 * Ok the idea here is that we want to skew the spreading of the
+		 * available space based on how it's being used across the
+		 * processors.  So here's how this works
+		 *
+		 * 1) if this pool has reserved bytes and the space we have
+		 * left covers them, just give it the number of reserved bytes
+		 * it has.
+		 *
+		 * 2) otherwise (no reserved bytes, or more reserved than we
+		 * have left) give it the per_pool amount.  You could just give
+		 * an idle pool 0, and in some cases it works fine (single
+		 * threaded cases), and in some cases it doesn't (multi-threaded
+		 * cases).  Giving it 0 versus not in the single threaded case
+		 * doesn't make a difference, so give it the per pool.
+		 *
+		 * 3) if what is left is less than the per pool amount, just
+		 * give the pool the rest of the space.
+		 */
+		if (pool->reserved_bytes && total >= pool->reserved_bytes) {
+			pool->total_bytes = pool->reserved_bytes;
+			total -= pool->reserved_bytes;
+		} else if (total >= per_pool) {
+			pool->total_bytes = per_pool;
+			total -= per_pool;
+		} else {
+			pool->total_bytes = total;
+			total = 0;
+		}
+		spin_unlock(&pool->lock);
+	}
+
+	/*
+	 * If there's any space left over, just give it to the guy that we're
+	 * currently on, since we're likely to be doing work soon anyway.
+	 */
+	if (total) {
+		pool = per_cpu_ptr(fs_info->reserved_space_pool, raw_smp_processor_id());
+		spin_lock(&pool->lock);
+		pool->total_bytes += total;
+		spin_unlock(&pool->lock);
+	}
 }
 
 /*
@@ -4626,6 +4861,7 @@  again:
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 {
+	struct btrfs_reserved_space_pool *pool;
 	struct btrfs_block_group_cache *cache;
 	int ret = 0;
 
@@ -4642,6 +4878,30 @@  int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 	update_reserved_extents(cache, len, 0);
 	btrfs_put_block_group(cache);
 
+	pool = per_cpu_ptr(root->fs_info->reserved_space_pool,
+			   raw_smp_processor_id());
+	spin_lock(&pool->lock);
+	if (pool->used_bytes < len) {
+		int i;
+		spin_unlock(&pool->lock);
+		for_each_possible_cpu(i) {
+			if (i == raw_smp_processor_id())
+				continue;
+			pool = per_cpu_ptr(root->fs_info->reserved_space_pool,
+					   i);
+			spin_lock(&pool->lock);
+			if (pool->used_bytes >= len) {
+				pool->used_bytes -= len;
+				spin_unlock(&pool->lock);
+				break;
+			}
+			spin_unlock(&pool->lock);
+		}
+	} else {
+		pool->used_bytes -= len;
+		spin_unlock(&pool->lock);
+	}
+
 	return ret;
 }
 
@@ -4939,6 +5199,7 @@  struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					struct btrfs_disk_key *key, int level,
 					u64 hint, u64 empty_size)
 {
+	struct btrfs_reserved_space_pool *pool;
 	struct btrfs_key ins;
 	int ret;
 	struct extent_buffer *buf;
@@ -4950,6 +5211,12 @@  struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 		return ERR_PTR(ret);
 	}
 
+	pool = per_cpu_ptr(root->fs_info->reserved_space_pool,
+			   raw_smp_processor_id());
+	spin_lock(&pool->lock);
+	pool->used_bytes += ins.offset;
+	spin_unlock(&pool->lock);
+
 	buf = btrfs_init_new_buffer(trans, root, ins.objectid,
 				    blocksize, level);
 	return buf;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b383e53..b5a36b3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1340,6 +1340,7 @@  static int btrfs_clear_bit_hook(struct inode *inode,
 
 		if (bits & EXTENT_DO_ACCOUNTING) {
 			spin_lock(&BTRFS_I(inode)->accounting_lock);
+			BUG_ON(!BTRFS_I(inode)->outstanding_extents);
 			BTRFS_I(inode)->outstanding_extents--;
 			spin_unlock(&BTRFS_I(inode)->accounting_lock);
 			btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5799bc4..031dcc5 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -307,6 +307,7 @@  int btrfs_remove_ordered_extent(struct inode *inode,
 	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
 
 	spin_lock(&BTRFS_I(inode)->accounting_lock);
+	BUG_ON(!BTRFS_I(inode)->outstanding_extents);
 	BTRFS_I(inode)->outstanding_extents--;
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
 	btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c207e8c..37f755a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1056,6 +1056,8 @@  int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_prepare_extent_commit(trans, root);
 
+	btrfs_init_space_pools(root->fs_info);
+
 	cur_trans = root->fs_info->running_transaction;
 	spin_lock(&root->fs_info->new_trans_lock);
 	root->fs_info->running_transaction = NULL;