Message ID | 20180928111821.24376-7-josef@toxicpanda.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | My current patch queue | expand |
On 28.09.2018 14:17, Josef Bacik wrote: > From: Josef Bacik <jbacik@fb.com> > > Traditionally we've had voodoo in btrfs to account for the space that > delayed refs may take up by having a global_block_rsv. This works most > of the time, except when it doesn't. We've had issues reported and seen > in production where sometimes the global reserve is exhausted during > transaction commit before we can run all of our delayed refs, resulting > in an aborted transaction. Because of this voodoo we have equally > dubious flushing semantics around throttling delayed refs which we often > get wrong. > > So instead give them their own block_rsv. This way we can always know > exactly how much outstanding space we need for delayed refs. This > allows us to make sure we are constantly filling that reservation up > with space, and allows us to put more precise pressure on the enospc > system. Instead of doing math to see if its a good time to throttle, > the normal enospc code will be invoked if we have a lot of delayed refs > pending, and they will be run via the normal flushing mechanism. > > For now the delayed_refs_rsv will hold the reservations for the delayed > refs, the block group updates, and deleting csums. We could have a > separate rsv for the block group updates, but the csum deletion stuff is > still handled via the delayed_refs so that will stay there. 
> > Signed-off-by: Josef Bacik <jbacik@fb.com> > --- > fs/btrfs/ctree.h | 27 +++-- > fs/btrfs/delayed-ref.c | 28 ++++- > fs/btrfs/disk-io.c | 4 + > fs/btrfs/extent-tree.c | 279 +++++++++++++++++++++++++++++++++++-------- > fs/btrfs/inode.c | 2 +- > fs/btrfs/transaction.c | 77 ++++++------ > include/trace/events/btrfs.h | 2 + > 7 files changed, 312 insertions(+), 107 deletions(-) > > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h > index 66f1d3895bca..1a2c3b629af2 100644 > --- a/fs/btrfs/ctree.h > +++ b/fs/btrfs/ctree.h > @@ -452,8 +452,9 @@ struct btrfs_space_info { > #define BTRFS_BLOCK_RSV_TRANS 3 > #define BTRFS_BLOCK_RSV_CHUNK 4 > #define BTRFS_BLOCK_RSV_DELOPS 5 > -#define BTRFS_BLOCK_RSV_EMPTY 6 > -#define BTRFS_BLOCK_RSV_TEMP 7 > +#define BTRFS_BLOCK_RSV_DELREFS 6 > +#define BTRFS_BLOCK_RSV_EMPTY 7 > +#define BTRFS_BLOCK_RSV_TEMP 8 > > struct btrfs_block_rsv { > u64 size; > @@ -794,6 +795,8 @@ struct btrfs_fs_info { > struct btrfs_block_rsv chunk_block_rsv; > /* block reservation for delayed operations */ > struct btrfs_block_rsv delayed_block_rsv; > + /* block reservation for delayed refs */ > + struct btrfs_block_rsv delayed_refs_rsv; > > struct btrfs_block_rsv empty_block_rsv; > > @@ -2608,8 +2611,7 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, > > int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, > struct btrfs_fs_info *fs_info); > -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, > - struct btrfs_fs_info *fs_info); > +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); > void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, > const u64 start); > void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); > @@ -2723,10 +2725,12 @@ enum btrfs_reserve_flush_enum { > enum btrfs_flush_state { > FLUSH_DELAYED_ITEMS_NR = 1, > FLUSH_DELAYED_ITEMS = 2, > - FLUSH_DELALLOC = 3, > - FLUSH_DELALLOC_WAIT = 4, > - ALLOC_CHUNK = 
5, > - COMMIT_TRANS = 6, > + FLUSH_DELAYED_REFS_NR = 3, > + FLUSH_DELAYED_REFS = 4, > + FLUSH_DELALLOC = 5, > + FLUSH_DELALLOC_WAIT = 6, > + ALLOC_CHUNK = 7, > + COMMIT_TRANS = 8, > }; > > int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); > @@ -2777,6 +2781,13 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, > void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, > struct btrfs_block_rsv *block_rsv, > u64 num_bytes); > +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); > +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); > +int btrfs_throttle_delayed_refs(struct btrfs_fs_info *fs_info, > + enum btrfs_reserve_flush_enum flush); > +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, > + struct btrfs_block_rsv *src, > + u64 num_bytes); > int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); > void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); > void btrfs_put_block_group_cache(struct btrfs_fs_info *info); > diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c > index 27f7dd4e3d52..96ce087747b2 100644 > --- a/fs/btrfs/delayed-ref.c > +++ b/fs/btrfs/delayed-ref.c > @@ -467,11 +467,14 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans, > * existing and update must have the same bytenr > */ > static noinline void > -update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, > +update_existing_head_ref(struct btrfs_trans_handle *trans, > struct btrfs_delayed_ref_head *existing, > struct btrfs_delayed_ref_head *update, > int *old_ref_mod_ret) > { > + struct btrfs_delayed_ref_root *delayed_refs = > + &trans->transaction->delayed_refs; > + struct btrfs_fs_info *fs_info = trans->fs_info; > int old_ref_mod; > > BUG_ON(existing->is_data != update->is_data); > @@ -529,10 +532,18 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, > * versa we need to make sure to adjust pending_csums 
accordingly. > */ > if (existing->is_data) { > - if (existing->total_ref_mod >= 0 && old_ref_mod < 0) > + u64 csum_items = > + btrfs_csum_bytes_to_leaves(fs_info, > + existing->num_bytes); > + > + if (existing->total_ref_mod >= 0 && old_ref_mod < 0) { > delayed_refs->pending_csums -= existing->num_bytes; > + btrfs_delayed_refs_rsv_release(fs_info, csum_items); > + } > + if (existing->total_ref_mod < 0 && old_ref_mod >= 0) { > delayed_refs->pending_csums += existing->num_bytes; > + trans->delayed_ref_updates += csum_items; > + } > } > spin_unlock(&existing->lock); > } > @@ -638,7 +649,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, > && head_ref->qgroup_reserved > && existing->qgroup_ref_root > && existing->qgroup_reserved); > - update_existing_head_ref(delayed_refs, existing, head_ref, > + update_existing_head_ref(trans, existing, head_ref, > old_ref_mod); > /* > * we've updated the existing ref, free the newly > @@ -649,8 +660,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, > } else { > if (old_ref_mod) > *old_ref_mod = 0; > - if (head_ref->is_data && head_ref->ref_mod < 0) > + if (head_ref->is_data && head_ref->ref_mod < 0) { > delayed_refs->pending_csums += head_ref->num_bytes; > + trans->delayed_ref_updates += > + btrfs_csum_bytes_to_leaves(trans->fs_info, > + head_ref->num_bytes); > + } > delayed_refs->num_heads++; > delayed_refs->num_heads_ready++; > atomic_inc(&delayed_refs->num_entries); > @@ -785,6 +800,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, > > ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); > spin_unlock(&delayed_refs->lock); > + btrfs_update_delayed_refs_rsv(trans); You haven't addressed my initial point about merging the modification of delayed_ref_updates and the call to btrfs_update_delayed_refs_rsv into one function; otherwise this seems error-prone. 
I don't see why this cannot be done; if there is some reason which I'm missing, then explain it. As it stands, btrfs_update_delayed_refs_rsv is paired with the modifications made in one of the 2nd-level callees: btrfs_add_delayed_tree_ref add_delayed_ref_head update_existing_head_ref I'd rather have btrfs_update_delayed_refs_rsv renamed to something else with 'inc' in its name and called every time we modify delayed_ref_updates. I'm willing to bet 50 bucks that in 6 months' time someone will change delayed_ref_updates and will forget to call btrfs_update_delayed_refs_rsv. WRT locking in update_existing_head_ref we are guaranteed to hold delayed_refs->lock, same thing in add_delayed_extent_op. The only places where we don't hold it are in the bg-related paths. But that's easily solvable by simply breaking the function down into an internal helper doing the actual work with lockdep_assert_held(delayed_refs) at the top and a "public" API which will be taking the lock and calling the helper. WRT performance, you will not be putting that much extra code in the critical section, i.e. the check + the arithmetic of btrfs_calc_trans_metadata_size. > > trace_add_delayed_tree_ref(fs_info, &ref->node, ref, > action == BTRFS_ADD_DELAYED_EXTENT ? > @@ -866,6 +882,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, > > ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); > spin_unlock(&delayed_refs->lock); > + btrfs_update_delayed_refs_rsv(trans); > > trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref, > action == BTRFS_ADD_DELAYED_EXTENT ? 
> @@ -903,6 +920,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, > NULL, NULL, NULL); > > spin_unlock(&delayed_refs->lock); > + btrfs_update_delayed_refs_rsv(trans); > return 0; > } > > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c > index 5124c15705ce..377ad9c1cb17 100644 > --- a/fs/btrfs/disk-io.c > +++ b/fs/btrfs/disk-io.c > @@ -2692,6 +2692,9 @@ int open_ctree(struct super_block *sb, > btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); > btrfs_init_block_rsv(&fs_info->delayed_block_rsv, > BTRFS_BLOCK_RSV_DELOPS); > + btrfs_init_block_rsv(&fs_info->delayed_refs_rsv, > + BTRFS_BLOCK_RSV_DELREFS); > + > atomic_set(&fs_info->async_delalloc_pages, 0); > atomic_set(&fs_info->defrag_running, 0); > atomic_set(&fs_info->qgroup_op_seq, 0); > @@ -4419,6 +4422,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, > > spin_unlock(&cur_trans->dirty_bgs_lock); > btrfs_put_block_group(cache); > + btrfs_delayed_refs_rsv_release(fs_info, 1); > spin_lock(&cur_trans->dirty_bgs_lock); > } > spin_unlock(&cur_trans->dirty_bgs_lock); > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c > index b32bd38390dd..1213f573eea2 100644 > --- a/fs/btrfs/extent-tree.c > +++ b/fs/btrfs/extent-tree.c > @@ -2481,6 +2481,7 @@ static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans, > struct btrfs_fs_info *fs_info = trans->fs_info; > struct btrfs_delayed_ref_root *delayed_refs = > &trans->transaction->delayed_refs; > + int nr_items = 1; > > if (head->total_ref_mod < 0) { > struct btrfs_space_info *space_info; > @@ -2502,12 +2503,15 @@ static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans, > spin_lock(&delayed_refs->lock); > delayed_refs->pending_csums -= head->num_bytes; > spin_unlock(&delayed_refs->lock); > + nr_items += btrfs_csum_bytes_to_leaves(fs_info, > + head->num_bytes); > } > } > > /* Also free its reserved qgroup space */ > btrfs_qgroup_free_delayed_ref(fs_info, 
head->qgroup_ref_root, > head->qgroup_reserved); > + btrfs_delayed_refs_rsv_release(fs_info, nr_items); > } > > static int cleanup_ref_head(struct btrfs_trans_handle *trans, > @@ -2802,40 +2806,22 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) > return num_csums; > } > > -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, > - struct btrfs_fs_info *fs_info) > +bool btrfs_check_space_for_delayed_refs( struct btrfs_fs_info *fs_info) > { > - struct btrfs_block_rsv *global_rsv; > - u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; > - u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; > - unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs; > - u64 num_bytes, num_dirty_bgs_bytes; > - int ret = 0; > - > - num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); > - num_heads = heads_to_leaves(fs_info, num_heads); > - if (num_heads > 1) > - num_bytes += (num_heads - 1) * fs_info->nodesize; > - num_bytes <<= 1; > - num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) * > - fs_info->nodesize; > - num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info, > - num_dirty_bgs); > - global_rsv = &fs_info->global_block_rsv; > - > - /* > - * If we can't allocate any more chunks lets make sure we have _lots_ of > - * wiggle room since running delayed refs can create more delayed refs. 
> - */ > - if (global_rsv->space_info->full) { > - num_dirty_bgs_bytes <<= 1; > - num_bytes <<= 1; > - } > + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; > + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; > + u64 reserved; > + bool ret = false; > > spin_lock(&global_rsv->lock); > - if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes) > - ret = 1; > + reserved = global_rsv->reserved; > spin_unlock(&global_rsv->lock); > + > + spin_lock(&delayed_refs_rsv->lock); > + reserved += delayed_refs_rsv->reserved; > + if (delayed_refs_rsv->size >= reserved) > + ret = true; > + spin_unlock(&delayed_refs_rsv->lock); > return ret; > } > > @@ -2855,7 +2841,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, > if (val >= NSEC_PER_SEC / 2) > return 2; > > - return btrfs_check_space_for_delayed_refs(trans, fs_info); > + return btrfs_check_space_for_delayed_refs(fs_info) ? 1 : 0; > } > > struct async_delayed_refs { > @@ -3610,6 +3596,8 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) > */ > mutex_lock(&trans->transaction->cache_write_mutex); > while (!list_empty(&dirty)) { > + bool drop_reserve = true; > + > cache = list_first_entry(&dirty, > struct btrfs_block_group_cache, > dirty_list); > @@ -3682,6 +3670,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) > list_add_tail(&cache->dirty_list, > &cur_trans->dirty_bgs); > btrfs_get_block_group(cache); > + drop_reserve = false; > } > spin_unlock(&cur_trans->dirty_bgs_lock); > } else if (ret) { > @@ -3692,6 +3681,8 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) > /* if its not on the io list, we need to put the block group */ > if (should_put) > btrfs_put_block_group(cache); > + if (drop_reserve) > + btrfs_delayed_refs_rsv_release(fs_info, 1); > > if (ret) > break; > @@ -3840,6 +3831,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, > /* if its not on the io list, we 
need to put the block group */ > if (should_put) > btrfs_put_block_group(cache); > + btrfs_delayed_refs_rsv_release(fs_info, 1); > spin_lock(&cur_trans->dirty_bgs_lock); > } > spin_unlock(&cur_trans->dirty_bgs_lock); > @@ -4816,8 +4808,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, > { > struct reserve_ticket *ticket = NULL; > struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; > + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; > struct btrfs_trans_handle *trans; > u64 bytes; > + u64 reclaim_bytes = 0; > > trans = (struct btrfs_trans_handle *)current->journal_info; > if (trans) > @@ -4850,12 +4844,16 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, > return -ENOSPC; > > spin_lock(&delayed_rsv->lock); > - if (delayed_rsv->size > bytes) > - bytes = 0; > - else > - bytes -= delayed_rsv->size; > + reclaim_bytes += delayed_rsv->reserved; > spin_unlock(&delayed_rsv->lock); > > + spin_lock(&delayed_refs_rsv->lock); > + reclaim_bytes += delayed_refs_rsv->reserved; > + spin_unlock(&delayed_refs_rsv->lock); > + if (reclaim_bytes >= bytes) > + goto commit; > + bytes -= reclaim_bytes; > + > if (__percpu_counter_compare(&space_info->total_bytes_pinned, > bytes, > BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) { > @@ -4905,6 +4903,20 @@ static void flush_space(struct btrfs_fs_info *fs_info, > shrink_delalloc(fs_info, num_bytes * 2, num_bytes, > state == FLUSH_DELALLOC_WAIT); > break; > + case FLUSH_DELAYED_REFS_NR: > + case FLUSH_DELAYED_REFS: > + trans = btrfs_join_transaction(root); > + if (IS_ERR(trans)) { > + ret = PTR_ERR(trans); > + break; > + } > + if (state == FLUSH_DELAYED_REFS_NR) > + nr = calc_reclaim_items_nr(fs_info, num_bytes); > + else > + nr = 0; > + btrfs_run_delayed_refs(trans, nr); > + btrfs_end_transaction(trans); > + break; > case ALLOC_CHUNK: > trans = btrfs_join_transaction(root); > if (IS_ERR(trans)) { > @@ -5377,6 +5389,91 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info 
*fs_info, > return 0; > } > > +/** > + * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. > + * @fs_info - the fs info for our fs. > + * @src - the source block rsv to transfer from. > + * @num_bytes - the number of bytes to transfer. > + * > + * This transfers up to the num_bytes amount from the src rsv to the > + * delayed_refs_rsv. Any extra bytes are returned to the space info. > + */ > +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, > + struct btrfs_block_rsv *src, > + u64 num_bytes) > +{ > + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; > + u64 to_free = 0; > + > + spin_lock(&src->lock); > + src->reserved -= num_bytes; > + src->size -= num_bytes; > + spin_unlock(&src->lock); > + > + spin_lock(&delayed_refs_rsv->lock); > + if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { > + u64 delta = delayed_refs_rsv->size - > + delayed_refs_rsv->reserved; > + if (num_bytes > delta) { > + to_free = num_bytes - delta; > + num_bytes = delta; > + } > + } else { > + to_free = num_bytes; > + num_bytes = 0; > + } > + > + if (num_bytes) > + delayed_refs_rsv->reserved += num_bytes; > + if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) > + delayed_refs_rsv->full = 1; > + spin_unlock(&delayed_refs_rsv->lock); > + > + if (num_bytes) > + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", > + 0, num_bytes, 1); > + if (to_free) > + space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info, > + to_free); > +} > + > +/** > + * btrfs_throttle_delayed_refs - throttle based on our delayed refs usage. > + * @fs_info - the fs_info for our fs. > + * @flush - control how we can flush for this reservation. > + * > + * This will refill the delayed block_rsv up to 1 items size worth of space and > + * will return -ENOSPC if we can't make the reservation. 
> + */ > +int btrfs_throttle_delayed_refs(struct btrfs_fs_info *fs_info, > + enum btrfs_reserve_flush_enum flush) > +{ > + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; > + u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); > + u64 num_bytes = 0; > + int ret = -ENOSPC; > + > + spin_lock(&block_rsv->lock); > + if (block_rsv->reserved < block_rsv->size) { > + num_bytes = block_rsv->size - block_rsv->reserved; > + num_bytes = min(num_bytes, limit); > + } > + spin_unlock(&block_rsv->lock); > + > + if (!num_bytes) > + return 0; > + > + ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv, > + num_bytes, flush); > + if (ret) > + return ret; > + block_rsv_add_bytes(block_rsv, num_bytes, 0); > + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", > + 0, num_bytes, 1); > + return 0; > +} > + > + > /* > * This is for space we already have accounted in space_info->bytes_may_use, so > * basically when we're returning space from block_rsv's. > @@ -5699,6 +5796,31 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, > return ret; > } > > +static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, > + struct btrfs_block_rsv *block_rsv, > + u64 num_bytes, u64 *qgroup_to_release) > +{ > + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; > + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; > + struct btrfs_block_rsv *target = delayed_rsv; > + > + if (target->full || target == block_rsv) > + target = global_rsv; > + > + if (block_rsv->space_info != target->space_info) > + target = NULL; > + > + return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, > + qgroup_to_release); > +} > + > +void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, > + struct btrfs_block_rsv *block_rsv, > + u64 num_bytes) > +{ > + __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); > +} > + > /** > * btrfs_inode_rsv_release - release any excessive reservation. 
> * @inode - the inode we need to release from. > @@ -5713,7 +5835,6 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, > static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) > { > struct btrfs_fs_info *fs_info = inode->root->fs_info; > - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; > struct btrfs_block_rsv *block_rsv = &inode->block_rsv; > u64 released = 0; > u64 qgroup_to_release = 0; > @@ -5723,8 +5844,8 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) > * are releasing 0 bytes, and then we'll just get the reservation over > * the size free'd. > */ > - released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0, > - &qgroup_to_release); > + released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, > + &qgroup_to_release); > if (released > 0) > trace_btrfs_space_reservation(fs_info, "delalloc", > btrfs_ino(inode), released, 0); > @@ -5735,16 +5856,26 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) > qgroup_to_release); > } > > -void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, > - struct btrfs_block_rsv *block_rsv, > - u64 num_bytes) > +/** > + * btrfs_delayed_refs_rsv_release - release a ref head's reservation. > + * @fs_info - the fs_info for our fs. > + * @nr - the number of items to drop. > + * > + * This drops the delayed ref head's count from the delayed refs rsv and free's > + * any excess reservation we had. 
> + */ > +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) > { > + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; > struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; > + u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); > + u64 released = 0; > > - if (global_rsv == block_rsv || > - block_rsv->space_info != global_rsv->space_info) > - global_rsv = NULL; > - block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL); > + released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, > + num_bytes, NULL); > + if (released) > + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", > + 0, released, 0); > } > > static void update_global_block_rsv(struct btrfs_fs_info *fs_info) > @@ -5809,9 +5940,10 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) > fs_info->trans_block_rsv.space_info = space_info; > fs_info->empty_block_rsv.space_info = space_info; > fs_info->delayed_block_rsv.space_info = space_info; > + fs_info->delayed_refs_rsv.space_info = space_info; > > - fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; > - fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; > + fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; > + fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; > fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; > fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; > if (fs_info->quota_root) > @@ -5831,8 +5963,34 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) > WARN_ON(fs_info->chunk_block_rsv.reserved > 0); > WARN_ON(fs_info->delayed_block_rsv.size > 0); > WARN_ON(fs_info->delayed_block_rsv.reserved > 0); > + WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); > + WARN_ON(fs_info->delayed_refs_rsv.size > 0); > } > > +/* > + * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv > + * @trans - the trans that may have generated delayed refs > + * > + * 
This is to be called anytime we may have adjusted trans->delayed_ref_updates, > + * it'll calculate the additional size and add it to the delayed_refs_rsv. > + */ > +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) > +{ > + struct btrfs_fs_info *fs_info = trans->fs_info; > + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; > + u64 num_bytes; > + > + if (!trans->delayed_ref_updates) > + return; > + > + num_bytes = btrfs_calc_trans_metadata_size(fs_info, > + trans->delayed_ref_updates); > + spin_lock(&delayed_rsv->lock); > + delayed_rsv->size += num_bytes; > + delayed_rsv->full = 0; > + spin_unlock(&delayed_rsv->lock); > + trans->delayed_ref_updates = 0; > +} > > /* > * To be called after all the new block groups attached to the transaction > @@ -6126,6 +6284,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, > u64 old_val; > u64 byte_in_group; > int factor; > + int ret = 0; > > /* block accounting for super block */ > spin_lock(&info->delalloc_root_lock); > @@ -6139,8 +6298,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, > > while (total) { > cache = btrfs_lookup_block_group(info, bytenr); > - if (!cache) > - return -ENOENT; > + if (!cache) { > + ret = -ENOENT; > + break; > + } > factor = btrfs_bg_type_to_factor(cache->flags); > > /* > @@ -6199,6 +6360,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, > list_add_tail(&cache->dirty_list, > &trans->transaction->dirty_bgs); > trans->transaction->num_dirty_bgs++; > + trans->delayed_ref_updates++; > btrfs_get_block_group(cache); > } > spin_unlock(&trans->transaction->dirty_bgs_lock); > @@ -6216,7 +6378,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, > total -= num_bytes; > bytenr += num_bytes; > } > - return 0; > + > + /* Modified block groups are accounted for in the delayed_refs_rsv. 
*/ > + btrfs_update_delayed_refs_rsv(trans); > + return ret; > } > > static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) > @@ -8230,7 +8395,12 @@ use_block_rsv(struct btrfs_trans_handle *trans, > goto again; > } > > - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { > + /* > + * The global reserve still exists to save us from ourselves, so don't > + * warn_on if we are short on our delayed refs reserve. > + */ > + if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && > + btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { > static DEFINE_RATELIMIT_STATE(_rs, > DEFAULT_RATELIMIT_INTERVAL * 10, > /*DEFAULT_RATELIMIT_BURST*/ 1); > @@ -10146,6 +10316,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) > add_block_group_free_space(trans, block_group); > /* already aborted the transaction if it failed. */ > next: > + btrfs_delayed_refs_rsv_release(fs_info, 1); > list_del_init(&block_group->bg_list); > } > trans->can_flush_pending_bgs = can_flush_pending_bgs; > @@ -10223,6 +10394,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, > link_block_group(cache); > > list_add_tail(&cache->bg_list, &trans->new_bgs); > + trans->delayed_ref_updates++; > + btrfs_update_delayed_refs_rsv(trans); > > set_avail_alloc_bits(fs_info, type); > return 0; > @@ -10260,6 +10433,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, > int factor; > struct btrfs_caching_control *caching_ctl = NULL; > bool remove_em; > + bool remove_rsv = false; > > block_group = btrfs_lookup_block_group(fs_info, group_start); > BUG_ON(!block_group); > @@ -10324,6 +10498,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, > > if (!list_empty(&block_group->dirty_list)) { > list_del_init(&block_group->dirty_list); > + remove_rsv = true; > btrfs_put_block_group(block_group); > } > spin_unlock(&trans->transaction->dirty_bgs_lock); > @@ -10533,6 +10708,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, > > ret 
= btrfs_del_item(trans, root, path); > out: > + if (remove_rsv) > + btrfs_delayed_refs_rsv_release(fs_info, 1); > btrfs_free_path(path); > return ret; > } > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > index 212fa71317d6..cd00ec869c96 100644 > --- a/fs/btrfs/inode.c > +++ b/fs/btrfs/inode.c > @@ -5382,7 +5382,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, > * Try to steal from the global reserve if there is space for > * it. > */ > - if (!btrfs_check_space_for_delayed_refs(trans, fs_info) && > + if (!btrfs_check_space_for_delayed_refs(fs_info) && > !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0)) > return trans; > > diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c > index 3b84f5015029..117e0c4a914a 100644 > --- a/fs/btrfs/transaction.c > +++ b/fs/btrfs/transaction.c > @@ -455,7 +455,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, > bool enforce_qgroups) > { > struct btrfs_fs_info *fs_info = root->fs_info; > - > + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; > struct btrfs_trans_handle *h; > struct btrfs_transaction *cur_trans; > u64 num_bytes = 0; > @@ -484,13 +484,28 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, > * the appropriate flushing if need be. > */ > if (num_items && root != fs_info->chunk_root) { > + struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv; > + u64 delayed_refs_bytes = 0; > + > qgroup_reserved = num_items * fs_info->nodesize; > ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved, > enforce_qgroups); > if (ret) > return ERR_PTR(ret); > > + /* > + * We want to reserve all the bytes we may need all at once, so > + * we only do 1 enospc flushing cycle per transaction start. We > + * accomplish this by simply assuming we'll do 2 x num_items > + * worth of delayed refs updates in this trans handle, and > + * refill that amount for whatever is missing in the reserve. 
> + */ > num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items); > + if (delayed_refs_rsv->full == 0) { > + delayed_refs_bytes = num_bytes; > + num_bytes <<= 1; > + } > + > /* > * Do the reservation for the relocation root creation > */ > @@ -499,8 +514,24 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, > reloc_reserved = true; > } > > - ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv, > - num_bytes, flush); > + ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush); > + if (ret) > + goto reserve_fail; > + if (delayed_refs_bytes) { > + btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv, > + delayed_refs_bytes); > + num_bytes -= delayed_refs_bytes; > + } > + } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL && > + !delayed_refs_rsv->full) { > + /* > + * Some people call with btrfs_start_transaction(root, 0) > + * because they can be throttled, but have some other mechanism > + * for reserving space. We still want these guys to refill the > + * delayed block_rsv so just add 1 items worth of reservation > + * here. 
> + */ > + ret = btrfs_throttle_delayed_refs(fs_info, flush); > if (ret) > goto reserve_fail; > } > @@ -759,7 +790,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) > { > struct btrfs_fs_info *fs_info = trans->fs_info; > > - if (btrfs_check_space_for_delayed_refs(trans, fs_info)) > + if (btrfs_check_space_for_delayed_refs(fs_info)) > return 1; > > return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); > @@ -768,22 +799,12 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) > int btrfs_should_end_transaction(struct btrfs_trans_handle *trans) > { > struct btrfs_transaction *cur_trans = trans->transaction; > - int updates; > - int err; > > smp_mb(); > if (cur_trans->state >= TRANS_STATE_BLOCKED || > cur_trans->delayed_refs.flushing) > return 1; > > - updates = trans->delayed_ref_updates; > - trans->delayed_ref_updates = 0; > - if (updates) { > - err = btrfs_run_delayed_refs(trans, updates * 2); > - if (err) /* Error code will also eval true */ > - return err; > - } > - > return should_end_transaction(trans); > } > > @@ -813,11 +834,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, > { > struct btrfs_fs_info *info = trans->fs_info; > struct btrfs_transaction *cur_trans = trans->transaction; > - u64 transid = trans->transid; > - unsigned long cur = trans->delayed_ref_updates; > int lock = (trans->type != TRANS_JOIN_NOLOCK); > int err = 0; > - int must_run_delayed_refs = 0; > > if (refcount_read(&trans->use_count) > 1) { > refcount_dec(&trans->use_count); > @@ -828,27 +846,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, > btrfs_trans_release_metadata(trans); > trans->block_rsv = NULL; > > - if (!list_empty(&trans->new_bgs)) > - btrfs_create_pending_block_groups(trans); > - > - trans->delayed_ref_updates = 0; > - if (!trans->sync) { > - must_run_delayed_refs = > - btrfs_should_throttle_delayed_refs(trans, info); > - cur = max_t(unsigned long, cur, 32); > - > - /* > - * 
don't make the caller wait if they are from a NOLOCK > - * or ATTACH transaction, it will deadlock with commit > - */ > - if (must_run_delayed_refs == 1 && > - (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH))) > - must_run_delayed_refs = 2; > - } > - > - btrfs_trans_release_metadata(trans); > - trans->block_rsv = NULL; > - > if (!list_empty(&trans->new_bgs)) > btrfs_create_pending_block_groups(trans); > > @@ -893,10 +890,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, > } > > kmem_cache_free(btrfs_trans_handle_cachep, trans); > - if (must_run_delayed_refs) { > - btrfs_async_run_delayed_refs(info, cur, transid, > - must_run_delayed_refs == 1); > - } > return err; > } > > diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h > index b401c4e36394..7d205e50b09c 100644 > --- a/include/trace/events/btrfs.h > +++ b/include/trace/events/btrfs.h > @@ -1048,6 +1048,8 @@ TRACE_EVENT(btrfs_trigger_flush, > { FLUSH_DELAYED_ITEMS, "FLUSH_DELAYED_ITEMS"}, \ > { FLUSH_DELALLOC, "FLUSH_DELALLOC"}, \ > { FLUSH_DELALLOC_WAIT, "FLUSH_DELALLOC_WAIT"}, \ > + { FLUSH_DELAYED_REFS_NR, "FLUSH_DELAYED_REFS_NR"}, \ > + { FLUSH_DELAYED_REFS, "FLUSH_DELAYED_REFS"}, \ > { ALLOC_CHUNK, "ALLOC_CHUNK"}, \ > { COMMIT_TRANS, "COMMIT_TRANS"}) > >
On Fri, Sep 28, 2018 at 02:51:10PM +0300, Nikolay Borisov wrote: > > > On 28.09.2018 14:17, Josef Bacik wrote: > > From: Josef Bacik <jbacik@fb.com> > > > > Traditionally we've had voodoo in btrfs to account for the space that > > delayed refs may take up by having a global_block_rsv. This works most > > of the time, except when it doesn't. We've had issues reported and seen > > in production where sometimes the global reserve is exhausted during > > transaction commit before we can run all of our delayed refs, resulting > > in an aborted transaction. Because of this voodoo we have equally > > dubious flushing semantics around throttling delayed refs which we often > > get wrong. > > > > So instead give them their own block_rsv. This way we can always know > > exactly how much outstanding space we need for delayed refs. This > > allows us to make sure we are constantly filling that reservation up > > with space, and allows us to put more precise pressure on the enospc > > system. Instead of doing math to see if its a good time to throttle, > > the normal enospc code will be invoked if we have a lot of delayed refs > > pending, and they will be run via the normal flushing mechanism. > > > > For now the delayed_refs_rsv will hold the reservations for the delayed > > refs, the block group updates, and deleting csums. We could have a > > separate rsv for the block group updates, but the csum deletion stuff is > > still handled via the delayed_refs so that will stay there. 
> > > > Signed-off-by: Josef Bacik <jbacik@fb.com> > > --- > > fs/btrfs/ctree.h | 27 +++-- > > fs/btrfs/delayed-ref.c | 28 ++++- > > fs/btrfs/disk-io.c | 4 + > > fs/btrfs/extent-tree.c | 279 +++++++++++++++++++++++++++++++++++-------- > > fs/btrfs/inode.c | 2 +- > > fs/btrfs/transaction.c | 77 ++++++------ > > include/trace/events/btrfs.h | 2 + > > 7 files changed, 312 insertions(+), 107 deletions(-) > > > > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h > > index 66f1d3895bca..1a2c3b629af2 100644 > > --- a/fs/btrfs/ctree.h > > +++ b/fs/btrfs/ctree.h > > @@ -452,8 +452,9 @@ struct btrfs_space_info { > > #define BTRFS_BLOCK_RSV_TRANS 3 > > #define BTRFS_BLOCK_RSV_CHUNK 4 > > #define BTRFS_BLOCK_RSV_DELOPS 5 > > -#define BTRFS_BLOCK_RSV_EMPTY 6 > > -#define BTRFS_BLOCK_RSV_TEMP 7 > > +#define BTRFS_BLOCK_RSV_DELREFS 6 > > +#define BTRFS_BLOCK_RSV_EMPTY 7 > > +#define BTRFS_BLOCK_RSV_TEMP 8 > > > > struct btrfs_block_rsv { > > u64 size; > > @@ -794,6 +795,8 @@ struct btrfs_fs_info { > > struct btrfs_block_rsv chunk_block_rsv; > > /* block reservation for delayed operations */ > > struct btrfs_block_rsv delayed_block_rsv; > > + /* block reservation for delayed refs */ > > + struct btrfs_block_rsv delayed_refs_rsv; > > > > struct btrfs_block_rsv empty_block_rsv; > > > > @@ -2608,8 +2611,7 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, > > > > int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, > > struct btrfs_fs_info *fs_info); > > -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, > > - struct btrfs_fs_info *fs_info); > > +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); > > void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, > > const u64 start); > > void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); > > @@ -2723,10 +2725,12 @@ enum btrfs_reserve_flush_enum { > > enum btrfs_flush_state { > > FLUSH_DELAYED_ITEMS_NR = 1, 
> > FLUSH_DELAYED_ITEMS = 2, > > - FLUSH_DELALLOC = 3, > > - FLUSH_DELALLOC_WAIT = 4, > > - ALLOC_CHUNK = 5, > > - COMMIT_TRANS = 6, > > + FLUSH_DELAYED_REFS_NR = 3, > > + FLUSH_DELAYED_REFS = 4, > > + FLUSH_DELALLOC = 5, > > + FLUSH_DELALLOC_WAIT = 6, > > + ALLOC_CHUNK = 7, > > + COMMIT_TRANS = 8, > > }; > > > > int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); > > @@ -2777,6 +2781,13 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, > > void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, > > struct btrfs_block_rsv *block_rsv, > > u64 num_bytes); > > +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); > > +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); > > +int btrfs_throttle_delayed_refs(struct btrfs_fs_info *fs_info, > > + enum btrfs_reserve_flush_enum flush); > > +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, > > + struct btrfs_block_rsv *src, > > + u64 num_bytes); > > int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); > > void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); > > void btrfs_put_block_group_cache(struct btrfs_fs_info *info); > > diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c > > index 27f7dd4e3d52..96ce087747b2 100644 > > --- a/fs/btrfs/delayed-ref.c > > +++ b/fs/btrfs/delayed-ref.c > > @@ -467,11 +467,14 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans, > > * existing and update must have the same bytenr > > */ > > static noinline void > > -update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, > > +update_existing_head_ref(struct btrfs_trans_handle *trans, > > struct btrfs_delayed_ref_head *existing, > > struct btrfs_delayed_ref_head *update, > > int *old_ref_mod_ret) > > { > > + struct btrfs_delayed_ref_root *delayed_refs = > > + &trans->transaction->delayed_refs; > > + struct btrfs_fs_info *fs_info = trans->fs_info; > > int old_ref_mod; > > > > 
BUG_ON(existing->is_data != update->is_data); > > @@ -529,10 +532,18 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, > > * versa we need to make sure to adjust pending_csums accordingly. > > */ > > if (existing->is_data) { > > - if (existing->total_ref_mod >= 0 && old_ref_mod < 0) > > + u64 csum_items = > > + btrfs_csum_bytes_to_leaves(fs_info, > > + existing->num_bytes); > > + > > + if (existing->total_ref_mod >= 0 && old_ref_mod < 0) { > > delayed_refs->pending_csums -= existing->num_bytes; > > - if (existing->total_ref_mod < 0 && old_ref_mod >= 0) > > + btrfs_delayed_refs_rsv_release(fs_info, csum_items); > > + } > > + if (existing->total_ref_mod < 0 && old_ref_mod >= 0) { > > delayed_refs->pending_csums += existing->num_bytes; > > + trans->delayed_ref_updates += csum_items; > > + } > > } > > spin_unlock(&existing->lock); > > } > > @@ -638,7 +649,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, > > && head_ref->qgroup_reserved > > && existing->qgroup_ref_root > > && existing->qgroup_reserved); > > - update_existing_head_ref(delayed_refs, existing, head_ref, > > + update_existing_head_ref(trans, existing, head_ref, > > old_ref_mod); > > /* > > * we've updated the existing ref, free the newly > > @@ -649,8 +660,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, > > } else { > > if (old_ref_mod) > > *old_ref_mod = 0; > > - if (head_ref->is_data && head_ref->ref_mod < 0) > > + if (head_ref->is_data && head_ref->ref_mod < 0) { > > delayed_refs->pending_csums += head_ref->num_bytes; > > + trans->delayed_ref_updates += > > + btrfs_csum_bytes_to_leaves(trans->fs_info, > > + head_ref->num_bytes); > > + } > > delayed_refs->num_heads++; > > delayed_refs->num_heads_ready++; > > atomic_inc(&delayed_refs->num_entries); > > @@ -785,6 +800,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, > > > > ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); > > spin_unlock(&delayed_refs->lock); > > + 
btrfs_update_delayed_refs_rsv(trans); > > You haven't adressed my initial point about merging modification of > delayed_ref_updates and calling btrfs_update_delayed_refs_rsv into one > function otherwise this seems error prone. I don't see why this cannot > be made, if there is some reason which I'm missing then explain it. > > As it stands this btrfs_updated_delayed_refs_rsv is paired with the > modifications made in one of the 2nd level callees: > > btrfs_add_delayed_tree_ref > add_delayed_ref_head > update_existing_head_ref > > I'd rather have btrfs_update_delayed_refs_rsv renamed to something else > with 'inc' in its name and called everytime we modify > delayed_ref_update. I'm willing to bet 50 bucks in 6 months time someone > will change delayed_ref_updates and will forget to call > btrfs_update_delayed_refs_rsv. > Because we have to take the delayed_refs_rsv lock in this helper, I want to take it as little as possible since it is a fs wide lock, so I want to batch it. There's no reason to change it. Thanks, Josef
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 66f1d3895bca..1a2c3b629af2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -452,8 +452,9 @@ struct btrfs_space_info { #define BTRFS_BLOCK_RSV_TRANS 3 #define BTRFS_BLOCK_RSV_CHUNK 4 #define BTRFS_BLOCK_RSV_DELOPS 5 -#define BTRFS_BLOCK_RSV_EMPTY 6 -#define BTRFS_BLOCK_RSV_TEMP 7 +#define BTRFS_BLOCK_RSV_DELREFS 6 +#define BTRFS_BLOCK_RSV_EMPTY 7 +#define BTRFS_BLOCK_RSV_TEMP 8 struct btrfs_block_rsv { u64 size; @@ -794,6 +795,8 @@ struct btrfs_fs_info { struct btrfs_block_rsv chunk_block_rsv; /* block reservation for delayed operations */ struct btrfs_block_rsv delayed_block_rsv; + /* block reservation for delayed refs */ + struct btrfs_block_rsv delayed_refs_rsv; struct btrfs_block_rsv empty_block_rsv; @@ -2608,8 +2611,7 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); @@ -2723,10 +2725,12 @@ enum btrfs_reserve_flush_enum { enum btrfs_flush_state { FLUSH_DELAYED_ITEMS_NR = 1, FLUSH_DELAYED_ITEMS = 2, - FLUSH_DELALLOC = 3, - FLUSH_DELALLOC_WAIT = 4, - ALLOC_CHUNK = 5, - COMMIT_TRANS = 6, + FLUSH_DELAYED_REFS_NR = 3, + FLUSH_DELAYED_REFS = 4, + FLUSH_DELALLOC = 5, + FLUSH_DELALLOC_WAIT = 6, + ALLOC_CHUNK = 7, + COMMIT_TRANS = 8, }; int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); @@ -2777,6 +2781,13 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes); +void 
btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); +int btrfs_throttle_delayed_refs(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush); +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *src, + u64 num_bytes); int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 27f7dd4e3d52..96ce087747b2 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -467,11 +467,14 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans, * existing and update must have the same bytenr */ static noinline void -update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, +update_existing_head_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *existing, struct btrfs_delayed_ref_head *update, int *old_ref_mod_ret) { + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; int old_ref_mod; BUG_ON(existing->is_data != update->is_data); @@ -529,10 +532,18 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, * versa we need to make sure to adjust pending_csums accordingly. 
*/ if (existing->is_data) { - if (existing->total_ref_mod >= 0 && old_ref_mod < 0) + u64 csum_items = + btrfs_csum_bytes_to_leaves(fs_info, + existing->num_bytes); + + if (existing->total_ref_mod >= 0 && old_ref_mod < 0) { delayed_refs->pending_csums -= existing->num_bytes; - if (existing->total_ref_mod < 0 && old_ref_mod >= 0) + btrfs_delayed_refs_rsv_release(fs_info, csum_items); + } + if (existing->total_ref_mod < 0 && old_ref_mod >= 0) { delayed_refs->pending_csums += existing->num_bytes; + trans->delayed_ref_updates += csum_items; + } } spin_unlock(&existing->lock); } @@ -638,7 +649,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, && head_ref->qgroup_reserved && existing->qgroup_ref_root && existing->qgroup_reserved); - update_existing_head_ref(delayed_refs, existing, head_ref, + update_existing_head_ref(trans, existing, head_ref, old_ref_mod); /* * we've updated the existing ref, free the newly @@ -649,8 +660,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } else { if (old_ref_mod) *old_ref_mod = 0; - if (head_ref->is_data && head_ref->ref_mod < 0) + if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; + trans->delayed_ref_updates += + btrfs_csum_bytes_to_leaves(trans->fs_info, + head_ref->num_bytes); + } delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); @@ -785,6 +800,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); + btrfs_update_delayed_refs_rsv(trans); trace_add_delayed_tree_ref(fs_info, &ref->node, ref, action == BTRFS_ADD_DELAYED_EXTENT ? 
@@ -866,6 +882,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); + btrfs_update_delayed_refs_rsv(trans); trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref, action == BTRFS_ADD_DELAYED_EXTENT ? @@ -903,6 +920,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, NULL, NULL, NULL); spin_unlock(&delayed_refs->lock); + btrfs_update_delayed_refs_rsv(trans); return 0; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5124c15705ce..377ad9c1cb17 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2692,6 +2692,9 @@ int open_ctree(struct super_block *sb, btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, BTRFS_BLOCK_RSV_DELOPS); + btrfs_init_block_rsv(&fs_info->delayed_refs_rsv, + BTRFS_BLOCK_RSV_DELREFS); + atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->defrag_running, 0); atomic_set(&fs_info->qgroup_op_seq, 0); @@ -4419,6 +4422,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, spin_unlock(&cur_trans->dirty_bgs_lock); btrfs_put_block_group(cache); + btrfs_delayed_refs_rsv_release(fs_info, 1); spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b32bd38390dd..1213f573eea2 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2481,6 +2481,7 @@ static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; + int nr_items = 1; if (head->total_ref_mod < 0) { struct btrfs_space_info *space_info; @@ -2502,12 +2503,15 @@ static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans, spin_lock(&delayed_refs->lock); delayed_refs->pending_csums -= 
head->num_bytes; spin_unlock(&delayed_refs->lock); + nr_items += btrfs_csum_bytes_to_leaves(fs_info, + head->num_bytes); } } /* Also free its reserved qgroup space */ btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, head->qgroup_reserved); + btrfs_delayed_refs_rsv_release(fs_info, nr_items); } static int cleanup_ref_head(struct btrfs_trans_handle *trans, @@ -2802,40 +2806,22 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) return num_csums; } -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +bool btrfs_check_space_for_delayed_refs( struct btrfs_fs_info *fs_info) { - struct btrfs_block_rsv *global_rsv; - u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; - u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; - unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs; - u64 num_bytes, num_dirty_bgs_bytes; - int ret = 0; - - num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - num_heads = heads_to_leaves(fs_info, num_heads); - if (num_heads > 1) - num_bytes += (num_heads - 1) * fs_info->nodesize; - num_bytes <<= 1; - num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) * - fs_info->nodesize; - num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info, - num_dirty_bgs); - global_rsv = &fs_info->global_block_rsv; - - /* - * If we can't allocate any more chunks lets make sure we have _lots_ of - * wiggle room since running delayed refs can create more delayed refs. 
- */ - if (global_rsv->space_info->full) { - num_dirty_bgs_bytes <<= 1; - num_bytes <<= 1; - } + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + u64 reserved; + bool ret = false; spin_lock(&global_rsv->lock); - if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes) - ret = 1; + reserved = global_rsv->reserved; spin_unlock(&global_rsv->lock); + + spin_lock(&delayed_refs_rsv->lock); + reserved += delayed_refs_rsv->reserved; + if (delayed_refs_rsv->size >= reserved) + ret = true; + spin_unlock(&delayed_refs_rsv->lock); return ret; } @@ -2855,7 +2841,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, if (val >= NSEC_PER_SEC / 2) return 2; - return btrfs_check_space_for_delayed_refs(trans, fs_info); + return btrfs_check_space_for_delayed_refs(fs_info) ? 1 : 0; } struct async_delayed_refs { @@ -3610,6 +3596,8 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) */ mutex_lock(&trans->transaction->cache_write_mutex); while (!list_empty(&dirty)) { + bool drop_reserve = true; + cache = list_first_entry(&dirty, struct btrfs_block_group_cache, dirty_list); @@ -3682,6 +3670,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) list_add_tail(&cache->dirty_list, &cur_trans->dirty_bgs); btrfs_get_block_group(cache); + drop_reserve = false; } spin_unlock(&cur_trans->dirty_bgs_lock); } else if (ret) { @@ -3692,6 +3681,8 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) /* if its not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); + if (drop_reserve) + btrfs_delayed_refs_rsv_release(fs_info, 1); if (ret) break; @@ -3840,6 +3831,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, /* if its not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); + btrfs_delayed_refs_rsv_release(fs_info, 1); 
spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); @@ -4816,8 +4808,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, { struct reserve_ticket *ticket = NULL; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_trans_handle *trans; u64 bytes; + u64 reclaim_bytes = 0; trans = (struct btrfs_trans_handle *)current->journal_info; if (trans) @@ -4850,12 +4844,16 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, return -ENOSPC; spin_lock(&delayed_rsv->lock); - if (delayed_rsv->size > bytes) - bytes = 0; - else - bytes -= delayed_rsv->size; + reclaim_bytes += delayed_rsv->reserved; spin_unlock(&delayed_rsv->lock); + spin_lock(&delayed_refs_rsv->lock); + reclaim_bytes += delayed_refs_rsv->reserved; + spin_unlock(&delayed_refs_rsv->lock); + if (reclaim_bytes >= bytes) + goto commit; + bytes -= reclaim_bytes; + if (__percpu_counter_compare(&space_info->total_bytes_pinned, bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) { @@ -4905,6 +4903,20 @@ static void flush_space(struct btrfs_fs_info *fs_info, shrink_delalloc(fs_info, num_bytes * 2, num_bytes, state == FLUSH_DELALLOC_WAIT); break; + case FLUSH_DELAYED_REFS_NR: + case FLUSH_DELAYED_REFS: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + if (state == FLUSH_DELAYED_REFS_NR) + nr = calc_reclaim_items_nr(fs_info, num_bytes); + else + nr = 0; + btrfs_run_delayed_refs(trans, nr); + btrfs_end_transaction(trans); + break; case ALLOC_CHUNK: trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -5377,6 +5389,91 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, return 0; } +/** + * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. + * @fs_info - the fs info for our fs. + * @src - the source block rsv to transfer from. + * @num_bytes - the number of bytes to transfer. 
+ * + * This transfers up to the num_bytes amount from the src rsv to the + * delayed_refs_rsv. Any extra bytes are returned to the space info. + */ +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *src, + u64 num_bytes) +{ + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + u64 to_free = 0; + + spin_lock(&src->lock); + src->reserved -= num_bytes; + src->size -= num_bytes; + spin_unlock(&src->lock); + + spin_lock(&delayed_refs_rsv->lock); + if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { + u64 delta = delayed_refs_rsv->size - + delayed_refs_rsv->reserved; + if (num_bytes > delta) { + to_free = num_bytes - delta; + num_bytes = delta; + } + } else { + to_free = num_bytes; + num_bytes = 0; + } + + if (num_bytes) + delayed_refs_rsv->reserved += num_bytes; + if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) + delayed_refs_rsv->full = 1; + spin_unlock(&delayed_refs_rsv->lock); + + if (num_bytes) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + if (to_free) + space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info, + to_free); +} + +/** + * btrfs_throttle_delayed_refs - throttle based on our delayed refs usage. + * @fs_info - the fs_info for our fs. + * @flush - control how we can flush for this reservation. + * + * This will refill the delayed block_rsv up to 1 items size worth of space and + * will return -ENOSPC if we can't make the reservation. 
+ */ +int btrfs_throttle_delayed_refs(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; + u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); + u64 num_bytes = 0; + int ret = -ENOSPC; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + num_bytes = block_rsv->size - block_rsv->reserved; + num_bytes = min(num_bytes, limit); + } + spin_unlock(&block_rsv->lock); + + if (!num_bytes) + return 0; + + ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv, + num_bytes, flush); + if (ret) + return ret; + block_rsv_add_bytes(block_rsv, num_bytes, 0); + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + return 0; +} + + /* * This is for space we already have accounted in space_info->bytes_may_use, so * basically when we're returning space from block_rsv's. @@ -5699,6 +5796,31 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, return ret; } +static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, u64 *qgroup_to_release) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *target = delayed_rsv; + + if (target->full || target == block_rsv) + target = global_rsv; + + if (block_rsv->space_info != target->space_info) + target = NULL; + + return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, + qgroup_to_release); +} + +void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); +} + /** * btrfs_inode_rsv_release - release any excessive reservation. * @inode - the inode we need to release from. 
@@ -5713,7 +5835,6 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 released = 0; u64 qgroup_to_release = 0; @@ -5723,8 +5844,8 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) * are releasing 0 bytes, and then we'll just get the reservation over * the size free'd. */ - released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0, - &qgroup_to_release); + released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, + &qgroup_to_release); if (released > 0) trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), released, 0); @@ -5735,16 +5856,26 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) qgroup_to_release); } -void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) +/** + * btrfs_delayed_refs_rsv_release - release a ref head's reservation. + * @fs_info - the fs_info for our fs. + * @nr - the number of items to drop. + * + * This drops the delayed ref head's count from the delayed refs rsv and free's + * any excess reservation we had. 
+ */ +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) { + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); + u64 released = 0; - if (global_rsv == block_rsv || - block_rsv->space_info != global_rsv->space_info) - global_rsv = NULL; - block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL); + released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, + num_bytes, NULL); + if (released) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, released, 0); } static void update_global_block_rsv(struct btrfs_fs_info *fs_info) @@ -5809,9 +5940,10 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; fs_info->delayed_block_rsv.space_info = space_info; + fs_info->delayed_refs_rsv.space_info = space_info; - fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; - fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; + fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; + fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; if (fs_info->quota_root) @@ -5831,8 +5963,34 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->chunk_block_rsv.reserved > 0); WARN_ON(fs_info->delayed_block_rsv.size > 0); WARN_ON(fs_info->delayed_block_rsv.reserved > 0); + WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); + WARN_ON(fs_info->delayed_refs_rsv.size > 0); } +/* + * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv + * @trans - the trans that may have generated delayed refs + * + * This is to be called anytime we may have adjusted trans->delayed_ref_updates, + * it'll 
calculate the additional size and add it to the delayed_refs_rsv. + */ +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + u64 num_bytes; + + if (!trans->delayed_ref_updates) + return; + + num_bytes = btrfs_calc_trans_metadata_size(fs_info, + trans->delayed_ref_updates); + spin_lock(&delayed_rsv->lock); + delayed_rsv->size += num_bytes; + delayed_rsv->full = 0; + spin_unlock(&delayed_rsv->lock); + trans->delayed_ref_updates = 0; +} /* * To be called after all the new block groups attached to the transaction @@ -6126,6 +6284,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 old_val; u64 byte_in_group; int factor; + int ret = 0; /* block accounting for super block */ spin_lock(&info->delalloc_root_lock); @@ -6139,8 +6298,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, while (total) { cache = btrfs_lookup_block_group(info, bytenr); - if (!cache) - return -ENOENT; + if (!cache) { + ret = -ENOENT; + break; + } factor = btrfs_bg_type_to_factor(cache->flags); /* @@ -6199,6 +6360,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs); trans->transaction->num_dirty_bgs++; + trans->delayed_ref_updates++; btrfs_get_block_group(cache); } spin_unlock(&trans->transaction->dirty_bgs_lock); @@ -6216,7 +6378,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, total -= num_bytes; bytenr += num_bytes; } - return 0; + + /* Modified block groups are accounted for in the delayed_refs_rsv. 
*/ + btrfs_update_delayed_refs_rsv(trans); + return ret; } static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) @@ -8230,7 +8395,12 @@ use_block_rsv(struct btrfs_trans_handle *trans, goto again; } - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + /* + * The global reserve still exists to save us from ourselves, so don't + * warn_on if we are short on our delayed refs reserve. + */ + if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && + btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL * 10, /*DEFAULT_RATELIMIT_BURST*/ 1); @@ -10146,6 +10316,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) add_block_group_free_space(trans, block_group); /* already aborted the transaction if it failed. */ next: + btrfs_delayed_refs_rsv_release(fs_info, 1); list_del_init(&block_group->bg_list); } trans->can_flush_pending_bgs = can_flush_pending_bgs; @@ -10223,6 +10394,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, link_block_group(cache); list_add_tail(&cache->bg_list, &trans->new_bgs); + trans->delayed_ref_updates++; + btrfs_update_delayed_refs_rsv(trans); set_avail_alloc_bits(fs_info, type); return 0; @@ -10260,6 +10433,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, int factor; struct btrfs_caching_control *caching_ctl = NULL; bool remove_em; + bool remove_rsv = false; block_group = btrfs_lookup_block_group(fs_info, group_start); BUG_ON(!block_group); @@ -10324,6 +10498,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (!list_empty(&block_group->dirty_list)) { list_del_init(&block_group->dirty_list); + remove_rsv = true; btrfs_put_block_group(block_group); } spin_unlock(&trans->transaction->dirty_bgs_lock); @@ -10533,6 +10708,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); out: + if (remove_rsv) + btrfs_delayed_refs_rsv_release(fs_info, 1); 
btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 212fa71317d6..cd00ec869c96 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5382,7 +5382,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, * Try to steal from the global reserve if there is space for * it. */ - if (!btrfs_check_space_for_delayed_refs(trans, fs_info) && + if (!btrfs_check_space_for_delayed_refs(fs_info) && !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0)) return trans; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3b84f5015029..117e0c4a914a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -455,7 +455,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, bool enforce_qgroups) { struct btrfs_fs_info *fs_info = root->fs_info; - + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; u64 num_bytes = 0; @@ -484,13 +484,28 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, * the appropriate flushing if need be. */ if (num_items && root != fs_info->chunk_root) { + struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv; + u64 delayed_refs_bytes = 0; + qgroup_reserved = num_items * fs_info->nodesize; ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved, enforce_qgroups); if (ret) return ERR_PTR(ret); + /* + * We want to reserve all the bytes we may need all at once, so + * we only do 1 enospc flushing cycle per transaction start. We + * accomplish this by simply assuming we'll do 2 x num_items + * worth of delayed refs updates in this trans handle, and + * refill that amount for whatever is missing in the reserve. 
+ */ num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items); + if (delayed_refs_rsv->full == 0) { + delayed_refs_bytes = num_bytes; + num_bytes <<= 1; + } + /* * Do the reservation for the relocation root creation */ @@ -499,8 +514,24 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, reloc_reserved = true; } - ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv, - num_bytes, flush); + ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush); + if (ret) + goto reserve_fail; + if (delayed_refs_bytes) { + btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv, + delayed_refs_bytes); + num_bytes -= delayed_refs_bytes; + } + } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL && + !delayed_refs_rsv->full) { + /* + * Some people call with btrfs_start_transaction(root, 0) + * because they can be throttled, but have some other mechanism + * for reserving space. We still want these guys to refill the + * delayed block_rsv so just add 1 items worth of reservation + * here. 
+ */ + ret = btrfs_throttle_delayed_refs(fs_info, flush); if (ret) goto reserve_fail; } @@ -759,7 +790,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - if (btrfs_check_space_for_delayed_refs(trans, fs_info)) + if (btrfs_check_space_for_delayed_refs(fs_info)) return 1; return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); @@ -768,22 +799,12 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) int btrfs_should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; - int updates; - int err; smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED || cur_trans->delayed_refs.flushing) return 1; - updates = trans->delayed_ref_updates; - trans->delayed_ref_updates = 0; - if (updates) { - err = btrfs_run_delayed_refs(trans, updates * 2); - if (err) /* Error code will also eval true */ - return err; - } - return should_end_transaction(trans); } @@ -813,11 +834,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; - u64 transid = trans->transid; - unsigned long cur = trans->delayed_ref_updates; int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; - int must_run_delayed_refs = 0; if (refcount_read(&trans->use_count) > 1) { refcount_dec(&trans->use_count); @@ -828,27 +846,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans); trans->block_rsv = NULL; - if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans); - - trans->delayed_ref_updates = 0; - if (!trans->sync) { - must_run_delayed_refs = - btrfs_should_throttle_delayed_refs(trans, info); - cur = max_t(unsigned long, cur, 32); - - /* - * don't make the caller wait if they are from a NOLOCK - * or ATTACH transaction, it will deadlock with commit - */ - if 
(must_run_delayed_refs == 1 && - (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH))) - must_run_delayed_refs = 2; - } - - btrfs_trans_release_metadata(trans); - trans->block_rsv = NULL; - - if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans); @@ -893,10 +890,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } kmem_cache_free(btrfs_trans_handle_cachep, trans); - if (must_run_delayed_refs) { - btrfs_async_run_delayed_refs(info, cur, transid, - must_run_delayed_refs == 1); - } return err; } diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index b401c4e36394..7d205e50b09c 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1048,6 +1048,8 @@ TRACE_EVENT(btrfs_trigger_flush, { FLUSH_DELAYED_ITEMS, "FLUSH_DELAYED_ITEMS"}, \ { FLUSH_DELALLOC, "FLUSH_DELALLOC"}, \ { FLUSH_DELALLOC_WAIT, "FLUSH_DELALLOC_WAIT"}, \ + { FLUSH_DELAYED_REFS_NR, "FLUSH_DELAYED_REFS_NR"}, \ + { FLUSH_DELAYED_REFS, "FLUSH_DELAYED_REFS"}, \ { ALLOC_CHUNK, "ALLOC_CHUNK"}, \ { COMMIT_TRANS, "COMMIT_TRANS"})