From patchwork Thu Feb 25 20:44:45 2010
X-Patchwork-Submitter: Josef Bacik
X-Patchwork-Id: 82162
Date: Thu, 25 Feb 2010 15:44:45 -0500
From: Josef Bacik
To: Shaohua Li
Cc: chris.mason@oracle.com, linux-btrfs@vger.kernel.org
Subject: Re: [RFC] remove delalloc accounting for __btrfs_remove_ordered_extent
Message-ID: <20100225204444.GE10960@localhost.localdomain>
References: <20100224071205.GA14237@sli10-desk.sh.intel.com>
In-Reply-To: <20100224071205.GA14237@sli10-desk.sh.intel.com>
User-Agent: Mutt/1.5.19 (2009-01-05)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0f2db97..5c2b9cc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -667,6 +667,13 @@ struct btrfs_block_group_item {
 	__le64 flags;
 } __attribute__ ((__packed__));
+struct btrfs_reserved_space_pool {
+	u64 total_bytes;
+	u64 reserved_bytes;
+	u64 used_bytes;
+	spinlock_t lock;
+};
+
 struct btrfs_space_info {
 	u64 flags;
@@ -689,8 +696,6 @@ struct btrfs_space_info {
 				   chunks for this space */
 	int force_alloc;	/* set if we need to force a chunk alloc for
 				   this space */
-	int force_delalloc;	/* make people start doing filemap_flush until
-				   we're under a threshold */
 	struct list_head list;
@@ -985,6 +990,7 @@ struct btrfs_fs_info {
 	unsigned metadata_ratio;
 	void *bdev_holder;
+	struct btrfs_reserved_space_pool *reserved_space_pool;
 };
 /*
@@ -2049,9 +2055,9 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
 int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
 int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
-					struct inode *inode, int num_items);
+					struct inode *inode, u64 bytes);
 int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
-					struct inode *inode, int num_items);
+					struct inode *inode, u64 bytes);
 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 				u64 bytes);
 void btrfs_free_reserved_data_space(struct btrfs_root *root,
@@ -2060,6 +2066,7 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root,
 				  struct inode *inode, u64 bytes);
 void
btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); +void btrfs_init_space_pools(struct btrfs_fs_info *fs_info); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 94debaf..3d0007f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1521,6 +1521,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, struct btrfs_root *log_tree_root; int ret; + int i; int err = -EINVAL; struct btrfs_super_block *disk_super; @@ -1866,8 +1867,23 @@ struct btrfs_root *open_ctree(struct super_block *sb, csum_root->track_dirty = 1; + fs_info->reserved_space_pool = + alloc_percpu(struct btrfs_reserved_space_pool); + if (!fs_info->reserved_space_pool) + goto fail_csum_root; + + for_each_possible_cpu(i) { + struct btrfs_reserved_space_pool *pool; + pool = per_cpu_ptr(fs_info->reserved_space_pool, i); + spin_lock_init(&pool->lock); + pool->total_bytes = 0; + pool->reserved_bytes = 0; + pool->used_bytes = 0; + } + btrfs_read_block_groups(extent_root); + btrfs_init_space_pools(fs_info); fs_info->generation = generation; fs_info->last_trans_committed = generation; fs_info->data_alloc_profile = (u64)-1; @@ -2403,6 +2419,7 @@ int close_ctree(struct btrfs_root *root) free_extent_buffer(root->fs_info->csum_root->commit_root); btrfs_free_block_groups(root->fs_info); + free_percpu(fs_info->reserved_space_pool); del_fs_roots(fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 12a2d23..32b409d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2665,6 +2665,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->full = 0; spin_unlock(&found->lock); *space_info = found; + btrfs_init_space_pools(info); return 0; } found = kzalloc(sizeof(*found), GFP_NOFS); @@ -2672,6 +2673,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, return -ENOMEM; INIT_LIST_HEAD(&found->block_groups); + init_waitqueue_head(&found->flush_wait); init_rwsem(&found->groups_sem); spin_lock_init(&found->lock); found->flags = flags; @@ -2686,6 +2688,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, *space_info = found; list_add_rcu(&found->list, &info->space_info); atomic_set(&found->caching_threads, 0); + + if (flags & BTRFS_BLOCK_GROUP_METADATA) + btrfs_init_space_pools(info); + return 0; } @@ -2818,65 +2824,27 @@ static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) * we have extents, this function does nothing. 
*/ int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items) + struct inode *inode, u64 bytes) { - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 alloc_target; - bool bug = false; - - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); + int num_items; - num_bytes = calculate_bytes_needed(root->fs_info->extent_root, - num_items); + num_items = (int)div64_u64(bytes + root->fs_info->max_extent - 1, + root->fs_info->max_extent); - spin_lock(&meta_sinfo->lock); spin_lock(&BTRFS_I(inode)->accounting_lock); if (BTRFS_I(inode)->reserved_extents <= BTRFS_I(inode)->outstanding_extents) { spin_unlock(&BTRFS_I(inode)->accounting_lock); - spin_unlock(&meta_sinfo->lock); return 0; } + BTRFS_I(inode)->reserved_extents -= num_items; spin_unlock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->reserved_extents--; - BUG_ON(BTRFS_I(inode)->reserved_extents < 0); - - if (meta_sinfo->bytes_delalloc < num_bytes) { - bug = true; - meta_sinfo->bytes_delalloc = 0; - } else { - meta_sinfo->bytes_delalloc -= num_bytes; - } - spin_unlock(&meta_sinfo->lock); - - BUG_ON(bug); + btrfs_unreserve_metadata_space(root, num_items); return 0; } -static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) -{ - u64 thresh; - - thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super + meta_sinfo->bytes_root + - meta_sinfo->bytes_may_use; - - thresh = meta_sinfo->total_bytes - thresh; - thresh *= 80; - do_div(thresh, 100); - if (thresh <= meta_sinfo->bytes_delalloc) - meta_sinfo->force_delalloc = 1; - else - meta_sinfo->force_delalloc = 0; -} - struct async_flush { struct btrfs_root *root; struct btrfs_space_info *info; @@ -2905,10 +2873,18 @@ static noinline void flush_delalloc_async(struct btrfs_work *work) kfree(async); } -static void wait_on_flush(struct btrfs_space_info *info) +static void wait_on_flush(struct btrfs_root *root, struct btrfs_space_info *info) { DEFINE_WAIT(wait); - u64 used; + u64 num_bytes; + u64 free; + int i; + + /* + * Number of CPU's * the maximum number of reservations that anybody + * would ever want to use + */ + num_bytes = calculate_bytes_needed(root, nr_cpu_ids * 5); while (1) { prepare_to_wait(&info->flush_wait, &wait, @@ -2919,14 +2895,28 @@ static void wait_on_flush(struct btrfs_space_info *info) break; } - used = info->bytes_used + info->bytes_reserved + - info->bytes_pinned + info->bytes_readonly + - info->bytes_super + info->bytes_root + - info->bytes_may_use + info->bytes_delalloc; - if (used < info->total_bytes) { + free = 0; + for_each_possible_cpu(i) { + struct btrfs_reserved_space_pool *pool; + pool = per_cpu_ptr(root->fs_info->reserved_space_pool, i); + spin_lock(&pool->lock); + if (pool->used_bytes + pool->reserved_bytes >= + pool->total_bytes) { + spin_unlock(&pool->lock); + continue; + } + free += pool->total_bytes - pool->used_bytes - + pool->reserved_bytes; + spin_unlock(&pool->lock); + if (free > num_bytes) + break; + } + + if (free > num_bytes) { spin_unlock(&info->lock); break; } + spin_unlock(&info->lock); schedule(); } @@ -2951,7 +2941,7 @@ static void flush_delalloc(struct btrfs_root *root, spin_unlock(&info->lock); if (wait) { - wait_on_flush(info); + wait_on_flush(root, info); return; } @@ -2965,7 +2955,7 @@ static void flush_delalloc(struct btrfs_root 
*root, btrfs_queue_worker(&root->fs_info->enospc_workers, &async->work); - wait_on_flush(info); + wait_on_flush(root, info); return; flush: @@ -2995,6 +2985,7 @@ static int maybe_allocate_chunk(struct btrfs_root *root, */ min_metadata = min((u64)10 * 1024 * 1024 * 1024, div64_u64(free_space * 5, 100)); + spin_lock(&info->lock); if (info->total_bytes >= min_metadata) { spin_unlock(&info->lock); return 0; @@ -3031,8 +3022,6 @@ static int maybe_allocate_chunk(struct btrfs_root *root, 4096 + 2 * 1024 * 1024, info->flags, 0); btrfs_end_transaction(trans, root); - if (ret) - goto out; out: spin_lock(&info->lock); info->allocating_chunk = 0; @@ -3048,74 +3037,140 @@ out: * Reserve metadata space for delalloc. */ int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items) + struct inode *inode, u64 bytes) { + struct btrfs_reserved_space_pool *pool; struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; + struct btrfs_space_info *meta_sinfo = NULL; + bool chunk_allocated = false; + bool delalloc_flushed = false; + bool inode_flushed = false; + u64 realloc_bytes = 0; u64 num_bytes; - u64 used; u64 alloc_target; - int flushed = 0; - int force_delalloc; + int num_items; + int retries = 0; + int i; - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); + num_items = (int)div64_u64(bytes + root->fs_info->max_extent - 1, + root->fs_info->max_extent); + num_bytes = calculate_bytes_needed(root, num_items); + + pool = per_cpu_ptr(info->reserved_space_pool, + raw_smp_processor_id()); - num_bytes = calculate_bytes_needed(root->fs_info->extent_root, - num_items); again: - spin_lock(&meta_sinfo->lock); + spin_lock(&pool->lock); - force_delalloc = meta_sinfo->force_delalloc; + if (realloc_bytes >= num_bytes) { + pool->total_bytes += realloc_bytes; + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->reserved_extents += num_items; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + spin_unlock(&pool->lock); + return 0; + } - if (unlikely(!meta_sinfo->bytes_root)) - meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); + if (!retries) + pool->reserved_bytes += num_bytes; - if (!flushed) - meta_sinfo->bytes_delalloc += num_bytes; + /* + * Fast path, we have plent of space in this pool to use, go ahead and + * use it and move on. + */ + if (pool->reserved_bytes + pool->used_bytes <= pool->total_bytes) { + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->reserved_extents += num_items; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + spin_unlock(&pool->lock); + return 0; + } - used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super + meta_sinfo->bytes_root + - meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; + retries++; + spin_unlock(&pool->lock); - if (used > meta_sinfo->total_bytes) { - flushed++; + /* + * Ok didn't find anything, try and steal from somebody elses pool. 
+	 */
+	for_each_possible_cpu(i) {
+		struct btrfs_reserved_space_pool *tmp_pool;
+		u64 free_bytes;
-		if (flushed == 1) {
-			if (maybe_allocate_chunk(root, meta_sinfo))
-				goto again;
-			flushed++;
-		} else {
-			spin_unlock(&meta_sinfo->lock);
+		tmp_pool = per_cpu_ptr(info->reserved_space_pool, i);
+		if (pool == tmp_pool)
+			continue;
+
+		spin_lock(&tmp_pool->lock);
+
+		if (tmp_pool->reserved_bytes + tmp_pool->used_bytes >=
+		    tmp_pool->total_bytes) {
+			spin_unlock(&tmp_pool->lock);
+			continue;
 		}
-		if (flushed == 2) {
-			filemap_flush(inode->i_mapping);
-			goto again;
-		} else if (flushed == 3) {
-			flush_delalloc(root, meta_sinfo);
+		free_bytes = tmp_pool->total_bytes - tmp_pool->used_bytes -
+			tmp_pool->reserved_bytes;
+
+		/*
+		 * If this pool has reserved bytes, but still has a lot of free
+		 * space, only take half of the free space.  The idea here is
+		 * that
+		 *
+		 * 1) If only one processor is doing the work then the others
+		 * won't have a lot of reserved bytes, and we can steal all of
+		 * their free space.
+		 *
+		 * 2) If all the processors are doing work, then we don't want
+		 * to steal a whole lot from them, but on the other hand we
+		 * don't want to have to keep stealing small amounts from
+		 * everybody, so take half the space and hope that this
+		 * processor will be back to use more space.
+		 */
+		if (tmp_pool->reserved_bytes > num_bytes &&
+		    num_bytes < free_bytes && num_bytes <= (free_bytes >> 1))
+			free_bytes = free_bytes >> 1;
+
+		realloc_bytes += free_bytes;
+		tmp_pool->total_bytes -= free_bytes;
+		spin_unlock(&tmp_pool->lock);
+
+		if (num_bytes <= realloc_bytes)
 			goto again;
-		}
-		spin_lock(&meta_sinfo->lock);
-		meta_sinfo->bytes_delalloc -= num_bytes;
-		spin_unlock(&meta_sinfo->lock);
-		printk(KERN_ERR "enospc, has %d, reserved %d\n",
-		       BTRFS_I(inode)->outstanding_extents,
-		       BTRFS_I(inode)->reserved_extents);
-		dump_space_info(meta_sinfo, 0, 0);
-		return -ENOSPC;
 	}
-	BTRFS_I(inode)->reserved_extents++;
-	check_force_delalloc(meta_sinfo);
-	spin_unlock(&meta_sinfo->lock);
-
-	if (!flushed && force_delalloc)
+	if (!inode_flushed) {
+		inode_flushed = true;
 		filemap_flush(inode->i_mapping);
+		goto again;
+	}
-	return 0;
+	if (!meta_sinfo) {
+		/* get the space info for where the metadata will live */
+		alloc_target = btrfs_get_alloc_profile(root, 0);
+		meta_sinfo = __find_space_info(info, alloc_target);
+	}
+
+	if (!delalloc_flushed) {
+		delalloc_flushed = true;
+		flush_delalloc(root, meta_sinfo);
+		goto again;
+	}
+
+	if (!chunk_allocated) {
+		chunk_allocated = true;
+		btrfs_wait_ordered_extents(root, 0, 0);
+		maybe_allocate_chunk(root, meta_sinfo);
+		goto again;
+	}
+
+	spin_lock(&pool->lock);
+	pool->reserved_bytes -= calculate_bytes_needed(root, num_items);
+	if (realloc_bytes)
+		pool->total_bytes += realloc_bytes;
+	spin_unlock(&pool->lock);
+
+	printk(KERN_ERR "delalloc reserve ran out of space!!!!\n");
+	return -ENOSPC;
 }
 /*
@@ -3129,28 +3184,54 @@ again:
  */
 int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
 {
+	struct btrfs_reserved_space_pool *pool;
 	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_space_info *meta_sinfo;
+	struct btrfs_space_info *meta_sinfo = NULL;
 	u64 num_bytes;
-	u64 alloc_target;
-	bool bug = false;
+	u64 alloc_target = btrfs_get_alloc_profile(root, 0);
+	int i;
-	/* get the space info for where the metadata will live */
-	alloc_target = btrfs_get_alloc_profile(root, 0);
+	num_bytes = calculate_bytes_needed(root, num_items);
+
+	pool = per_cpu_ptr(info->reserved_space_pool, raw_smp_processor_id());
 	meta_sinfo =
__find_space_info(info, alloc_target); - num_bytes = calculate_bytes_needed(root, num_items); + spin_lock(&pool->lock); + if (num_bytes <= pool->reserved_bytes) { + pool->reserved_bytes -= num_bytes; + spin_unlock(&pool->lock); + if (waitqueue_active(&meta_sinfo->flush_wait)) + wake_up(&meta_sinfo->flush_wait); + return 0; + } - spin_lock(&meta_sinfo->lock); - if (meta_sinfo->bytes_may_use < num_bytes) { - bug = true; - meta_sinfo->bytes_may_use = 0; - } else { - meta_sinfo->bytes_may_use -= num_bytes; + num_bytes -= pool->reserved_bytes; + pool->reserved_bytes = 0; + spin_unlock(&pool->lock); + + /* + * Ok we could have moved processors in between the reservation and + * here, so lets just take the reserved space away from the first pool + * we find. + */ + for_each_possible_cpu(i) { + pool = per_cpu_ptr(info->reserved_space_pool, i); + spin_lock(&pool->lock); + if (num_bytes <= pool->reserved_bytes) { + pool->reserved_bytes -= num_bytes; + spin_unlock(&pool->lock); + return 0; + } + + num_bytes -= pool->reserved_bytes; + pool->reserved_bytes = 0; + spin_unlock(&pool->lock); } - spin_unlock(&meta_sinfo->lock); - BUG_ON(bug); + if (waitqueue_active(&meta_sinfo->flush_wait)) + wake_up(&meta_sinfo->flush_wait); + + WARN_ON(num_bytes); return 0; } @@ -3170,58 +3251,220 @@ int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items) */ int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items) { + struct btrfs_reserved_space_pool *pool; struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; + struct btrfs_space_info *meta_sinfo = NULL; + bool chunk_allocated = false; + bool delalloc_flushed = false; + bool committed = false; + u64 realloc_bytes = 0; u64 num_bytes; - u64 used; u64 alloc_target; int retries = 0; - - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); + int i; num_bytes = calculate_bytes_needed(root, num_items); + + pool = per_cpu_ptr(info->reserved_space_pool, raw_smp_processor_id()); + again: - spin_lock(&meta_sinfo->lock); + spin_lock(&pool->lock); - if (unlikely(!meta_sinfo->bytes_root)) - meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); + /* + * If we've managed to acquire enough bytes from other pools then add it + * to our total bytes and exit. + */ + if (realloc_bytes >= num_bytes) { + pool->total_bytes += realloc_bytes; + spin_unlock(&pool->lock); + return 0; + } if (!retries) - meta_sinfo->bytes_may_use += num_bytes; + pool->reserved_bytes += num_bytes; - used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super + meta_sinfo->bytes_root + - meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; + /* + * Fast path, we have plent of space in this pool to use, go ahead and + * use it and move on. + */ + if (pool->reserved_bytes + pool->used_bytes <= pool->total_bytes) { + spin_unlock(&pool->lock); + return 0; + } - if (used > meta_sinfo->total_bytes) { - retries++; - if (retries == 1) { - if (maybe_allocate_chunk(root, meta_sinfo)) - goto again; - retries++; - } else { - spin_unlock(&meta_sinfo->lock); + retries++; + spin_unlock(&pool->lock); + + /* + * Ok don't have enough space, try and steal from somebody elses pool. 
+ */ + for_each_possible_cpu(i) { + struct btrfs_reserved_space_pool *tmp_pool; + u64 free_bytes; + + tmp_pool = per_cpu_ptr(info->reserved_space_pool, i); + if (tmp_pool == pool) + continue; + + spin_lock(&tmp_pool->lock); + + if (tmp_pool->reserved_bytes + tmp_pool->used_bytes >= + tmp_pool->total_bytes) { + spin_unlock(&tmp_pool->lock); + continue; } - if (retries == 2) { - flush_delalloc(root, meta_sinfo); + free_bytes = tmp_pool->total_bytes - tmp_pool->used_bytes - + tmp_pool->reserved_bytes; + + /* Only take 1/2 of the free space if its more than enough */ + if (tmp_pool->reserved_bytes > num_bytes && + num_bytes < free_bytes && num_bytes <= (free_bytes >> 1)) + free_bytes = free_bytes >> 1; + + realloc_bytes += free_bytes; + tmp_pool->total_bytes -= free_bytes; + spin_unlock(&tmp_pool->lock); + + if (num_bytes <= realloc_bytes) goto again; - } - spin_lock(&meta_sinfo->lock); - meta_sinfo->bytes_may_use -= num_bytes; - spin_unlock(&meta_sinfo->lock); + } - dump_space_info(meta_sinfo, 0, 0); - return -ENOSPC; + if (!meta_sinfo) { + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + } + + if (!chunk_allocated) { + chunk_allocated = true; + if (maybe_allocate_chunk(root, meta_sinfo)) + goto again; + } + + if (!delalloc_flushed) { + delalloc_flushed = true; + flush_delalloc(root, meta_sinfo); + goto again; + } + + if (!committed && !current->journal_info) { + struct btrfs_trans_handle *trans; + committed = true; + trans = btrfs_start_transaction(root, 1); + btrfs_commit_transaction(trans, root); + goto again; } - check_force_delalloc(meta_sinfo); + /* Oh well, we couldn't beg/borrow/steal enough space, just exit. */ + spin_lock(&pool->lock); + pool->reserved_bytes -= num_bytes; + if (realloc_bytes) + pool->total_bytes += realloc_bytes; + spin_unlock(&pool->lock); + + return -ENOSPC; +} + +void btrfs_init_space_pools(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *meta_sinfo = NULL; + struct btrfs_reserved_space_pool *pool; + u64 total; + u64 per_pool; + u64 used; + u64 alloc_target; + int i; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(fs_info->extent_root, 0); + meta_sinfo = __find_space_info(fs_info, alloc_target); + + /* + * This can happen during mount where we haven't quite set everything up + * yet. + */ + if (!meta_sinfo) + return; + + spin_lock(&meta_sinfo->lock); + + if (unlikely(!meta_sinfo->bytes_root)) + meta_sinfo->bytes_root = + calculate_bytes_needed(fs_info->extent_root, 6); + + used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super + meta_sinfo->bytes_root + + meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; + + /* + * Only use 80% of the free metadata space for reservation, so we have + * some spill-over room. + */ + total = meta_sinfo->total_bytes - used; spin_unlock(&meta_sinfo->lock); + total *= 80; + total = div64_u64(total, 100); - return 0; + per_pool = div64_u64(total, nr_cpu_ids); + for_each_possible_cpu(i) { + pool = per_cpu_ptr(fs_info->reserved_space_pool, i); + spin_lock(&pool->lock); + pool->used_bytes = 0; + + /* + * Ok the idea here is that we want to skew the spreading of the + * available space based on how it's being used across the + * processors. 
So here's how this works + * + * 1) if the total number of bytes we have is more than this + * pool has reserved, and this pool has reserved bytes, just + * give it the number of reserved bytes it has. + * + * 2) if the pool has no reserved bytes, give it the per_pool + * amount. You could just give it 0, and in some cases it works + * fine (single threaded cases), and in some cases it doesn't + * (multi-threaded cases). Giving it 0 versus not in the single + * threaded case doesn't make a difference, so give it hte per + * pool. + * + * 3) if total is less than the per pool amount, just give the + * pool the rest of the space. + */ + if (total >= pool->reserved_bytes) { + if (pool->reserved_bytes) { + pool->total_bytes = pool->reserved_bytes; + total -= pool->reserved_bytes; + } else if (total >= per_pool) { + pool->total_bytes = per_pool; + total -= per_pool; + } else { + pool->total_bytes = total; + total = 0; + } + } else { + if (total >= per_pool) { + pool->total_bytes = per_pool; + total -= per_pool; + } else { + pool->total_bytes = total; + total = 0; + } + } + spin_unlock(&pool->lock); + } + + /* + * If there's any space left over, just give it to the guy that we're + * currently on, since we're likely to be doing work soon anyway. + */ + if (total) { + pool = per_cpu_ptr(fs_info->reserved_space_pool, raw_smp_processor_id()); + spin_lock(&pool->lock); + pool->total_bytes += total; + spin_unlock(&pool->lock); + } } /* @@ -4638,6 +4881,7 @@ again: int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) { + struct btrfs_reserved_space_pool *pool; struct btrfs_block_group_cache *cache; int ret = 0; @@ -4654,6 +4898,30 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) update_reserved_extents(cache, len, 0); btrfs_put_block_group(cache); + pool = per_cpu_ptr(root->fs_info->reserved_space_pool, + raw_smp_processor_id()); + spin_lock(&pool->lock); + if (pool->used_bytes < len) { + int i; + spin_unlock(&pool->lock); + for_each_possible_cpu(i) { + if (i == raw_smp_processor_id()) + continue; + pool = per_cpu_ptr(root->fs_info->reserved_space_pool, + i); + spin_lock(&pool->lock); + if (pool->used_bytes >= len) { + pool->used_bytes -= len; + spin_unlock(&pool->lock); + break; + } + spin_unlock(&pool->lock); + } + } else { + pool->used_bytes -= len; + spin_unlock(&pool->lock); + } + return ret; } @@ -4967,6 +5235,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size) { + struct btrfs_reserved_space_pool *pool; struct btrfs_key ins; int ret; struct extent_buffer *buf; @@ -4978,6 +5247,12 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return ERR_PTR(ret); } + pool = per_cpu_ptr(root->fs_info->reserved_space_pool, + raw_smp_processor_id()); + spin_lock(&pool->lock); + pool->used_bytes += ins.offset; + spin_unlock(&pool->lock); + buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize, level); return buf; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c0c462f..740735c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -853,7 +853,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, /* do the reserve before the mutex lock in case we have to do some * flushing. We wouldn't deadlock, but this is more polite. 
*/ - err = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + err = btrfs_reserve_metadata_for_delalloc(root, inode, count); if (err) goto out_nolock; @@ -973,7 +973,7 @@ out: mutex_unlock(&inode->i_mutex); if (ret) err = ret; - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + btrfs_unreserve_metadata_for_delalloc(root, inode, count); out_nolock: kfree(pages); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bc7e4d9..7162f91 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1230,19 +1230,40 @@ static int btrfs_split_extent_hook(struct inode *inode, size = orig->end - orig->start + 1; if (size > root->fs_info->max_extent) { - u64 num_extents; - u64 new_size; + u64 left_extents, right_extents; + u64 orig_extents; + u64 left_size, right_size; - new_size = orig->end - split + 1; - num_extents = div64_u64(size + root->fs_info->max_extent - 1, + left_size = orig->end - split + 1; + right_size = split - orig->start; + left_extents = div64_u64(left_size + root->fs_info->max_extent - 1, + root->fs_info->max_extent); + right_extents = div64_u64(right_size + root->fs_info->max_extent - 1, + root->fs_info->max_extent); + orig_extents = div64_u64(size + root->fs_info->max_extent - 1, root->fs_info->max_extent); /* - * if we break a large extent up then leave oustanding_extents - * be, since we've already accounted for the large extent. + * If we are splitting off a max_extent multiple of space, then + * we don't need to account for another extent. However if we + * take off less than a max_extent, we need to add another + * outstanding_extent, because we will likely still have the + * original amount of extents needed for the big chunk left + * over. + * + * Think of it this way. We have 4 outstanding extents for the + * original giant chunk of data. If we split off a max_extent + * sized chunk, then the remaining chunk has 3 outstanding + * extens, and the new little chunk has 1 outstanding extents + * worth, so we're even. + * + * But if we carve off < max_extent, thats its own extent still. + * So the left over chunk still needs 4 extents to describe it + * on disk, and the chunk we just split off needs 1 extent, so + * thats 5 total extents, so we need to add 1 to + * outstanding_extents. */ - if (div64_u64(new_size + root->fs_info->max_extent - 1, - root->fs_info->max_extent) < num_extents) + if (left_extents + right_extents == orig_extents) return 0; } @@ -1264,18 +1285,16 @@ static int btrfs_merge_extent_hook(struct inode *inode, struct extent_state *other) { struct btrfs_root *root = BTRFS_I(inode)->root; - u64 new_size, old_size; - u64 num_extents; + u64 new_size, size1, size2; + u64 extents1, extents2; /* not delalloc, ignore it */ if (!(other->state & EXTENT_DELALLOC)) return 0; - old_size = other->end - other->start + 1; - if (new->start < other->start) - new_size = other->end - new->start + 1; - else - new_size = new->end - other->start + 1; + size1 = other->end - other->start + 1; + size2 = new->end - new->start + 1; + new_size = size1 + size2; /* we're not bigger than the max, unreserve the space and go */ if (new_size <= root->fs_info->max_extent) { @@ -1285,14 +1304,23 @@ static int btrfs_merge_extent_hook(struct inode *inode, return 0; } + extents1 = div64_u64(size1 + root->fs_info->max_extent - 1, + root->fs_info->max_extent); + extents2 = div64_u64(size2 + root->fs_info->max_extent - 1, + root->fs_info->max_extent); + /* - * If we grew by another max_extent, just return, we want to keep that - * reserved amount. 
+ * So if the number of extents required for the two chunks of space we + * are merging equals the number of extents we need for the entire space + * we now have, just return. + * + * The idea here is if max extent is say 1M, and the first chunk was + * exactly 1M and the second chunk was 4k, we now need 2 extents to + * cover that area, so we keep the reservation we got from adding the 4k + * extent. */ - num_extents = div64_u64(old_size + root->fs_info->max_extent - 1, - root->fs_info->max_extent); if (div64_u64(new_size + root->fs_info->max_extent - 1, - root->fs_info->max_extent) > num_extents) + root->fs_info->max_extent) == extents1 + extents2) return 0; spin_lock(&BTRFS_I(inode)->accounting_lock); @@ -1318,9 +1346,12 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, */ if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + int extents = (int)div64_u64(end - start + 1 + + root->fs_info->max_extent - 1, + root->fs_info->max_extent); spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->outstanding_extents++; + BTRFS_I(inode)->outstanding_extents += extents; spin_unlock(&BTRFS_I(inode)->accounting_lock); btrfs_delalloc_reserve_space(root, inode, end - start + 1); spin_lock(&root->fs_info->delalloc_lock); @@ -1350,10 +1381,22 @@ static int btrfs_clear_bit_hook(struct inode *inode, struct btrfs_root *root = BTRFS_I(inode)->root; if (bits & EXTENT_DO_ACCOUNTING) { + int extents = (int) + div64_u64(state->end - state->start + 1 + + root->fs_info->max_extent - 1, + root->fs_info->max_extent); + int bug = 0; spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->outstanding_extents--; + if (BTRFS_I(inode)->outstanding_extents >= extents) + BTRFS_I(inode)->outstanding_extents -= extents; + else + bug = 1; spin_unlock(&BTRFS_I(inode)->accounting_lock); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + BUG_ON(bug); + btrfs_unreserve_metadata_for_delalloc(root, inode, + state->end - + state->start + + 1); } spin_lock(&root->fs_info->delalloc_lock); @@ -3100,7 +3143,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) if (ret) goto out; - ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + ret = btrfs_reserve_metadata_for_delalloc(root, inode, PAGE_CACHE_SIZE); if (ret) goto out; @@ -3109,7 +3152,7 @@ again: page = grab_cache_page(mapping, index); if (!page) { btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + btrfs_unreserve_metadata_for_delalloc(root, inode, PAGE_CACHE_SIZE); goto out; } @@ -3173,7 +3216,7 @@ again: out_unlock: if (ret) btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + btrfs_unreserve_metadata_for_delalloc(root, inode, PAGE_CACHE_SIZE); unlock_page(page); page_cache_release(page); out: @@ -5086,7 +5129,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out; } - ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + ret = btrfs_reserve_metadata_for_delalloc(root, inode, PAGE_CACHE_SIZE); if (ret) { btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); ret = VM_FAULT_SIGBUS; @@ -5170,7 +5213,7 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); out_unlock: - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + btrfs_unreserve_metadata_for_delalloc(root, inode, PAGE_CACHE_SIZE); if (!ret) return VM_FAULT_LOCKED; 
unlock_page(page); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 31c6117..c0ac302 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1274,6 +1274,49 @@ out: return ret; } +long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_space_args space_args; + struct btrfs_ioctl_space_info space; + struct btrfs_ioctl_space_info *dest; + struct btrfs_space_info *info; + int ret = 0; + + if (copy_from_user(&space_args, + (struct btrfs_ioctl_space_args __user *)arg, + sizeof(space_args))) + return -EFAULT; + + space_args.total_spaces = 0; + dest = (struct btrfs_ioctl_space_info *) + (arg + sizeof(struct btrfs_ioctl_space_args)); + + rcu_read_lock(); + list_for_each_entry_rcu(info, &root->fs_info->space_info, list) { + if (!space_args.space_slots) { + space_args.total_spaces++; + continue; + } + if (space_args.total_spaces >= space_args.space_slots) + break; + space.flags = info->flags; + space.total_bytes = info->total_bytes; + space.used_bytes = info->bytes_used; + if (copy_to_user(dest, &space, sizeof(space))) { + ret = -EFAULT; + break; + } + dest++; + space_args.total_spaces++; + } + rcu_read_unlock(); + + if (copy_to_user(arg, &space_args, sizeof(space_args))) + ret = -EFAULT; + + return ret; +} + /* * there are many ways the trans_start and trans_end ioctls can lead * to deadlocks. They should only be used by applications that @@ -1338,6 +1381,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_trans_start(file); case BTRFS_IOC_TRANS_END: return btrfs_ioctl_trans_end(file); + case BTRFS_IOC_SPACE_INFO: + return btrfs_ioctl_space_info(root, argp); case BTRFS_IOC_SYNC: btrfs_sync_fs(file->f_dentry->d_sb, 1); return 0; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index bc49914..2dcc498 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -36,6 +36,18 @@ struct btrfs_ioctl_clone_range_args { __u64 dest_offset; }; +struct btrfs_ioctl_space_info { + u32 flags; + u64 total_bytes; + u64 used_bytes; +}; + +struct btrfs_ioctl_space_args { + u64 space_slots; + u64 total_spaces; + struct btrfs_ioctl_space_info spaces[0]; +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -67,4 +79,6 @@ struct btrfs_ioctl_clone_range_args { struct btrfs_ioctl_vol_args) #define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 18, \ + struct btrfs_ioctl_space_args) #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 724d73f..30b2c9c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -303,7 +303,9 @@ static int __btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry) { struct btrfs_ordered_inode_tree *tree; + struct btrfs_root *root = BTRFS_I(inode)->root; struct rb_node *node; + int bug = 0; tree = &BTRFS_I(inode)->ordered_tree; node = &entry->rb_node; @@ -311,13 +313,22 @@ static int __btrfs_remove_ordered_extent(struct inode *inode, tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + /* + * Since we limit ordered extents to root->max_extent we can just + * subtract 1 from outstanding extents, and send max_extent to + * unreserve_metadata_for_delalloc and everything will be a-ok. 
+ */ spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->outstanding_extents--; + if (BTRFS_I(inode)->outstanding_extents) + BTRFS_I(inode)->outstanding_extents--; + else + bug = 1; spin_unlock(&BTRFS_I(inode)->accounting_lock); - btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root, - inode, 1); + BUG_ON(bug); + btrfs_unreserve_metadata_for_delalloc(root, inode, + root->fs_info->max_extent); - spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); /* @@ -329,7 +340,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode, !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { list_del_init(&BTRFS_I(inode)->ordered_operations); } - spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_extent_lock); return 0; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b2acc79..877b3c9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1060,6 +1060,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_prepare_extent_commit(trans, root); + btrfs_init_space_pools(root->fs_info); + cur_trans = root->fs_info->running_transaction; spin_lock(&root->fs_info->new_trans_lock); root->fs_info->running_transaction = NULL;
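
For reference, every reserve/unreserve call site above turns a byte count into a number of metadata items with the same ceiling division against fs_info->max_extent. The following is a minimal userspace sketch of that arithmetic only; the names are illustrative, not the kernel helpers, and the 1M max_extent is just the value assumed in the merge-hook comment.

#include <stdint.h>
#include <stdio.h>

/* stand-in for div64_u64(bytes + max_extent - 1, max_extent) */
static uint64_t delalloc_items_needed(uint64_t bytes, uint64_t max_extent)
{
	/* ceiling division: any partial extent still costs a whole item */
	return (bytes + max_extent - 1) / max_extent;
}

int main(void)
{
	uint64_t max_extent = 1024 * 1024;	/* assumed 1M max_extent */

	/* a 4k write still reserves one item's worth of metadata */
	printf("4k    -> %llu item(s)\n",
	       (unsigned long long)delalloc_items_needed(4096, max_extent));
	/* 1M + 4k needs two, matching the split/merge hook reasoning */
	printf("1M+4k -> %llu item(s)\n",
	       (unsigned long long)delalloc_items_needed((1024 * 1024) + 4096,
							  max_extent));
	return 0;
}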