diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -424,7 +424,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
list_del(&ref->add_list);
ref->in_tree = 0;
btrfs_put_delayed_ref(ref);
- atomic_dec(&delayed_refs->num_entries);
+ btrfs_dec_delayed_ref_entries(delayed_refs);
}
static bool merge_ref(struct btrfs_trans_handle *trans,
@@ -580,7 +580,7 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
rb_erase_cached(&head->href_node, &delayed_refs->href_root);
RB_CLEAR_NODE(&head->href_node);
- atomic_dec(&delayed_refs->num_entries);
+ btrfs_dec_delayed_ref_entries(delayed_refs);
delayed_refs->num_heads--;
if (head->processing == 0)
delayed_refs->num_heads_ready--;
@@ -639,6 +639,7 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans,
if (ref->action == BTRFS_ADD_DELAYED_REF)
list_add_tail(&ref->add_list, &href->ref_add_list);
atomic_inc(&root->num_entries);
+ trans->total_delayed_refs++;
spin_unlock(&href->lock);
return ret;
}
@@ -843,6 +844,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
delayed_refs->num_heads_ready++;
atomic_inc(&delayed_refs->num_entries);
trans->delayed_ref_updates++;
+ trans->total_delayed_refs++;
}
if (qrecord_inserted_ret)
*qrecord_inserted_ret = qrecord_inserted;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -150,6 +150,13 @@ struct btrfs_delayed_ref_root {
*/
atomic_t num_entries;
+ /*
+ * How many entries we've run, and a corresponding waitqueue so that we
+ * can throttle generators appropriately.
+ */
+ atomic_t entries_run;
+ wait_queue_head_t wait;
+
/* total number of head nodes in tree */
unsigned long num_heads;
@@ -391,4 +398,11 @@ btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node)
return container_of(node, struct btrfs_delayed_data_ref, node);
}
+static inline void
+btrfs_dec_delayed_ref_entries(struct btrfs_delayed_ref_root *delayed_refs)
+{
+ atomic_dec(&delayed_refs->num_entries);
+ atomic_inc(&delayed_refs->entries_run);
+ wake_up(&delayed_refs->wait);
+}
#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1958,7 +1958,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
default:
WARN_ON(1);
}
- atomic_dec(&delayed_refs->num_entries);
/*
* Record the must_insert_reserved flag before we drop the
@@ -1974,6 +1973,9 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
ret = run_one_delayed_ref(trans, ref, extent_op,
must_insert_reserved);
+ /* Anybody who's been throttled may be woken up here. */
+ btrfs_dec_delayed_ref_entries(delayed_refs);
+
btrfs_free_delayed_extent_op(extent_op);
if (ret) {
unselect_delayed_ref_head(delayed_refs, locked_ref);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -307,6 +307,8 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info,
cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
atomic_set(&cur_trans->delayed_refs.num_entries, 0);
+ atomic_set(&cur_trans->delayed_refs.entries_run, 0);
+ init_waitqueue_head(&cur_trans->delayed_refs.wait);
/*
* although the tree mod log is per file system and not per transaction,
@@ -893,13 +895,29 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
trans->bytes_reserved = 0;
}
+static noinline void
+btrfs_throttle_for_delayed_refs(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ unsigned long refs, bool throttle)
+{
+ unsigned long threshold = max(refs, 1UL) +
+ atomic_read(&delayed_refs->entries_run);
+ wait_event_interruptible(delayed_refs->wait,
+ (atomic_read(&delayed_refs->entries_run) >= threshold) ||
+ !btrfs_should_throttle_delayed_refs(fs_info, delayed_refs,
+ throttle));
+}
+
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
int throttle)
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
+ unsigned long total_delayed_refs;
+ unsigned int trans_type = trans->type;
int err = 0;
bool run_async = false;
+ bool throttle_delayed_refs = false;
if (refcount_read(&trans->use_count) > 1) {
refcount_dec(&trans->use_count);
@@ -907,9 +925,23 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
return 0;
}
+ /*
+ * If we are over the threshold for our specified throttle then we need
+ * to throttle ourselves, because the async flusher is not keeping up.
+ *
+ * However, if we're just over the async threshold, simply kick the async
+ * flusher.
+ */
if (btrfs_should_throttle_delayed_refs(info,
- &cur_trans->delayed_refs, true))
+ &cur_trans->delayed_refs,
+ throttle)) {
run_async = true;
+ throttle_delayed_refs = true;
+ } else if (btrfs_should_throttle_delayed_refs(info,
+ &cur_trans->delayed_refs,
+ true)) {
+ run_async = true;
+ }
btrfs_trans_release_metadata(trans);
trans->block_rsv = NULL;
@@ -918,7 +950,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_chunk_metadata(trans);
- if (trans->type & __TRANS_FREEZABLE)
+ if (trans_type & __TRANS_FREEZABLE)
sb_end_intwrite(info->sb);
WARN_ON(cur_trans != info->running_transaction);
@@ -927,7 +959,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
extwriter_counter_dec(cur_trans, trans->type);
cond_wake_up(&cur_trans->writer_wait);
- btrfs_put_transaction(cur_trans);
if (current->journal_info == trans)
current->journal_info = NULL;
@@ -935,6 +966,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (throttle)
btrfs_run_delayed_iputs(info);
+ total_delayed_refs = trans->total_delayed_refs;
if (TRANS_ABORTED(trans) ||
test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) {
wake_up_process(info->transaction_kthread);
@@ -946,6 +978,17 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
&info->async_delayed_ref_work);
kmem_cache_free(btrfs_trans_handle_cachep, trans);
+
+ /*
+ * We only want to throttle generators, i.e. btrfs_start_transaction()
+ * callers.
+ */
+ if (throttle_delayed_refs && total_delayed_refs &&
+ (trans_type & __TRANS_START))
+ btrfs_throttle_for_delayed_refs(info, &cur_trans->delayed_refs,
+ total_delayed_refs, throttle);
+ btrfs_put_transaction(cur_trans);
+
return err;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -109,7 +109,23 @@ struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
u64 chunk_bytes_reserved;
+
+ /*
+ * This tracks the number of items required for the delayed ref rsv, and
+ * is used by that code. The accounting is
+ *
+ * - 1 per delayed ref head (individual items are not counted).
+ * - number of csum items that would be inserted for data.
+ * - block group item updates.
+ */
unsigned long delayed_ref_updates;
+
+ /*
+ * This is the total number of delayed refs that we added with this
+ * trans handle; it is used by the end of transaction throttling code.
+ */
+ unsigned long total_delayed_refs;
+
struct btrfs_transaction *transaction;
struct btrfs_block_rsv *block_rsv;
struct btrfs_block_rsv *orig_rsv;
We need to make sure we don't generate so many delayed refs that the box
gets overwhelmed at commit time.  Keep a sequence number of the entries
that have been run, and if we need to be throttled based on our time
constraints, wait at the end of our transaction for the number of delayed
refs we added to be run.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/delayed-ref.c |  6 ++++--
 fs/btrfs/delayed-ref.h | 14 ++++++++++++
 fs/btrfs/extent-tree.c |  4 +++-
 fs/btrfs/transaction.c | 49 +++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/transaction.h | 16 ++++++++++++++
 5 files changed, 83 insertions(+), 6 deletions(-)
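
To illustrate the throttling pattern outside of the btrfs internals, here is a
minimal userspace sketch, assuming a pthreads environment: a plain counter
plays the role of entries_run, a condition variable stands in for the
delayed_refs waitqueue, and all of the names here (struct ref_throttle,
throttle_init(), throttle_note_run(), throttle_wait()) are invented for the
example rather than taken from the kernel code.  The worker bumps the counter
once per ref it runs; a generator samples the counter, adds the number of refs
it created, and blocks until the worker has caught up that far.

#include <pthread.h>

/*
 * Illustrative only: a userspace analog of the patch's throttle.  The
 * entries_run counter is the sequence number of refs processed so far;
 * the condition variable stands in for the kernel waitqueue.
 */
struct ref_throttle {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	unsigned long entries_run;
};

static void throttle_init(struct ref_throttle *t)
{
	pthread_mutex_init(&t->lock, NULL);
	pthread_cond_init(&t->cond, NULL);
	t->entries_run = 0;
}

/* Worker side: called each time one delayed ref has been run. */
static void throttle_note_run(struct ref_throttle *t)
{
	pthread_mutex_lock(&t->lock);
	t->entries_run++;
	pthread_cond_broadcast(&t->cond);
	pthread_mutex_unlock(&t->lock);
}

/*
 * Generator side: at "end of transaction", wait until at least refs_added
 * more entries have been run past the point where we sampled entries_run,
 * mirroring the threshold computed in btrfs_throttle_for_delayed_refs().
 */
static void throttle_wait(struct ref_throttle *t, unsigned long refs_added)
{
	unsigned long threshold;

	pthread_mutex_lock(&t->lock);
	threshold = t->entries_run + (refs_added ? refs_added : 1);
	while (t->entries_run < threshold)
		pthread_cond_wait(&t->cond, &t->lock);
	pthread_mutex_unlock(&t->lock);
}

Unlike the kernel helper, this sketch always waits until the threshold is
reached; the real btrfs_throttle_for_delayed_refs() also re-checks
btrfs_should_throttle_delayed_refs(), so a waiter is released as soon as the
overall delayed ref backlog drops back below the throttle threshold.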