
[3/7] bcache: make bch_btree_check() to be multithreaded

Message ID 20200322060305.70637-4-colyli@suse.de (mailing list archive)
State New, archived
Series bcache patches for Linux v5.7-rc1

Commit Message

Coly Li March 22, 2020, 6:03 a.m. UTC
When registering a cache device, bch_btree_check() is called to check
all btree nodes, to make sure the btree is consistent and not corrupted.

bch_btree_check() is recursively executed in a single thread. When a lot
of data is cached and the btree is huge, it may take a very long time to
check all the btree nodes. In my testing, I observed it took around
50 minutes to finish bch_btree_check().

When the bcache btree nodes are checked, the cache set is not running yet
and the whole tree is in a read-only state, so it is safe to create
multiple threads to check the btree in parallel.

This patch creates multiple threads; each thread checks, one by one, the
sub-trees indexed by the keys of the btree root node. The number of
parallel threads depends on how many keys are in the btree root node. At
most BCH_BTR_CHKTHREAD_MAX (64) threads can be created, but in practice
it is min(cpu-number/2, root-node-keys-number).

Signed-off-by: Coly Li <colyli@suse.de>
Cc: Christoph Hellwig <hch@infradead.org>
---
 drivers/md/bcache/btree.c | 169 +++++++++++++++++++++++++++++++++++++-
 drivers/md/bcache/btree.h |  22 +++++
 2 files changed, 188 insertions(+), 3 deletions(-)

Comments

Hannes Reinecke March 23, 2020, 7 a.m. UTC | #1
On 3/22/20 7:03 AM, Coly Li wrote:
> When registering a cache device, bch_btree_check() is called to check
> all btree nodes, to make sure the btree is consistent and not corrupted.
>
> bch_btree_check() is recursively executed in a single thread. When a lot
> of data is cached and the btree is huge, it may take a very long time to
> check all the btree nodes. In my testing, I observed it took around
> 50 minutes to finish bch_btree_check().
>
> When the bcache btree nodes are checked, the cache set is not running yet
> and the whole tree is in a read-only state, so it is safe to create
> multiple threads to check the btree in parallel.
>
> This patch creates multiple threads; each thread checks, one by one, the
> sub-trees indexed by the keys of the btree root node. The number of
> parallel threads depends on how many keys are in the btree root node. At
> most BCH_BTR_CHKTHREAD_MAX (64) threads can be created, but in practice
> it is min(cpu-number/2, root-node-keys-number).
> 
> Signed-off-by: Coly Li <colyli@suse.de>
> Cc: Christoph Hellwig <hch@infradead.org>
> ---
>   drivers/md/bcache/btree.c | 169 +++++++++++++++++++++++++++++++++++++-
>   drivers/md/bcache/btree.h |  22 +++++
>   2 files changed, 188 insertions(+), 3 deletions(-)
> 
> [...]
Any particular reason why you are not using workqueues?
Kthreads are a bit of overkill here, I think...

Cheers,

Hannes
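
For reference, below is a minimal, hypothetical sketch of what Hannes's
workqueue suggestion could look like; it is not part of the posted patch.
The example_* names, the pending counter, and the completion are illustrative
assumptions; bch_btree_op_init(), bcache_btree(), and check_recurse are the
helpers the patch itself uses.

/*
 * Hypothetical sketch only: drive the per-root-key sub-tree checks from
 * the unbound system workqueue instead of dedicated kthreads.
 */
#include <linux/workqueue.h>
#include <linux/completion.h>
#include <linux/atomic.h>

struct example_check_work {
	struct work_struct	work;
	struct cache_set	*c;
	struct bkey		*key;		/* root-node key indexing one sub-tree */
	atomic_t		*pending;	/* outstanding work items */
	struct completion	*done;		/* completed when *pending drops to 0 */
	int			result;
};

static void example_check_workfn(struct work_struct *work)
{
	struct example_check_work *w =
		container_of(work, struct example_check_work, work);
	struct btree_op op;

	bch_btree_op_init(&op, 0);
	/* same recursion the kthread version performs for one sub-tree */
	w->result = bcache_btree(check_recurse, w->key, w->c->root, &op);

	if (atomic_dec_and_test(w->pending))
		complete(w->done);
}

/* submit one work item per root-node key; the caller waits on *done */
static void example_queue_one(struct example_check_work *w)
{
	INIT_WORK(&w->work, example_check_workfn);
	queue_work(system_unbound_wq, &w->work);
}

One trade-off either way: with kthreads the submitter fixes the worker count
up front (min(cpu-number/2, root-node-keys-number)), while an unbound
workqueue lets the workqueue core manage concurrency for the queued items.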

Patch

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index faf152524a16..74d66b641169 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1897,13 +1897,176 @@  static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
 	return ret;
 }
 
+
+static int bch_btree_check_thread(void *arg)
+{
+	int ret;
+	struct btree_check_info *info = arg;
+	struct btree_check_state *check_state = info->state;
+	struct cache_set *c = check_state->c;
+	struct btree_iter iter;
+	struct bkey *k, *p;
+	int cur_idx, prev_idx, skip_nr;
+	int i, n;
+
+	k = p = NULL;
+	i = n = 0;
+	cur_idx = prev_idx = 0;
+	ret = 0;
+
+	/* root node keys are checked before thread created */
+	bch_btree_iter_init(&c->root->keys, &iter, NULL);
+	k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
+	BUG_ON(!k);
+
+	p = k;
+	while (k) {
+		/*
+		 * Fetch a root node key index, skip the keys which
+		 * should be fetched by other threads, then check the
+		 * sub-tree indexed by the fetched key.
+		 */
+		spin_lock(&check_state->idx_lock);
+		cur_idx = check_state->key_idx;
+		check_state->key_idx++;
+		spin_unlock(&check_state->idx_lock);
+
+		skip_nr = cur_idx - prev_idx;
+
+		while (skip_nr) {
+			k = bch_btree_iter_next_filter(&iter,
+						       &c->root->keys,
+						       bch_ptr_bad);
+			if (k)
+				p = k;
+			else {
+				/*
+				 * No more keys to check in root node,
+				 * current checking threads are enough,
+				 * stop creating more.
+				 */
+				atomic_set(&check_state->enough, 1);
+				/* Update check_state->enough earlier */
+				smp_mb();
+				goto out;
+			}
+			skip_nr--;
+			cond_resched();
+		}
+
+		if (p) {
+			struct btree_op op;
+
+			btree_node_prefetch(c->root, p);
+			c->gc_stats.nodes++;
+			bch_btree_op_init(&op, 0);
+			ret = bcache_btree(check_recurse, p, c->root, &op);
+			if (ret)
+				goto out;
+		}
+		p = NULL;
+		prev_idx = cur_idx;
+		cond_resched();
+	}
+
+out:
+	info->result = ret;
+	/* update check_state->started among all CPUs */
+	smp_mb();
+	if (atomic_dec_and_test(&check_state->started))
+		wake_up(&check_state->wait);
+
+	return ret;
+}
+
+
+
+static int bch_btree_chkthread_nr(void)
+{
+	int n = num_online_cpus()/2;
+
+	if (n == 0)
+		n = 1;
+	else if (n > BCH_BTR_CHKTHREAD_MAX)
+		n = BCH_BTR_CHKTHREAD_MAX;
+
+	return n;
+}
+
 int bch_btree_check(struct cache_set *c)
 {
-	struct btree_op op;
+	int ret = 0;
+	int i;
+	struct bkey *k = NULL;
+	struct btree_iter iter;
+	struct btree_check_state *check_state;
+	char name[32];
 
-	bch_btree_op_init(&op, SHRT_MAX);
+	/* check and mark root node keys */
+	for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid)
+		bch_initial_mark_key(c, c->root->level, k);
+
+	bch_initial_mark_key(c, c->root->level + 1, &c->root->key);
+
+	if (c->root->level == 0)
+		return 0;
+
+	check_state = kzalloc(sizeof(struct btree_check_state), GFP_KERNEL);
+	if (!check_state)
+		return -ENOMEM;
+
+	check_state->c = c;
+	check_state->total_threads = bch_btree_chkthread_nr();
+	check_state->key_idx = 0;
+	spin_lock_init(&check_state->idx_lock);
+	atomic_set(&check_state->started, 0);
+	atomic_set(&check_state->enough, 0);
+	init_waitqueue_head(&check_state->wait);
 
-	return bcache_btree_root(check_recurse, c, &op);
+	/*
+	 * Run multiple threads to check btree nodes in parallel,
+	 * if check_state->enough is non-zero, it means current
+	 * running check threads are enough, unncessary to create
+	 * more.
+	 */
+	for (i = 0; i < check_state->total_threads; i++) {
+		/* fetch latest check_state->enough earlier */
+		smp_mb();
+		if (atomic_read(&check_state->enough))
+			break;
+
+		check_state->infos[i].result = 0;
+		check_state->infos[i].state = check_state;
+		snprintf(name, sizeof(name), "bch_btrchk[%u]", i);
+		atomic_inc(&check_state->started);
+
+		check_state->infos[i].thread =
+			kthread_run(bch_btree_check_thread,
+				    &check_state->infos[i],
+				    name);
+		if (IS_ERR(check_state->infos[i].thread)) {
+			pr_err("fails to run thread bch_btrchk[%d]", i);
+			for (--i; i >= 0; i--)
+				kthread_stop(check_state->infos[i].thread);
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	wait_event_interruptible(check_state->wait,
+				 atomic_read(&check_state->started) == 0 ||
+				  test_bit(CACHE_SET_IO_DISABLE, &c->flags));
+
+	for (i = 0; i < check_state->total_threads; i++) {
+		if (check_state->infos[i].result) {
+			ret = check_state->infos[i].result;
+			goto out;
+		}
+	}
+
+out:
+	kfree(check_state);
+	return ret;
 }
 
 void bch_initial_gc_finish(struct cache_set *c)
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 19e30266070a..7c884f278da8 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -145,6 +145,9 @@  struct btree {
 	struct bio		*bio;
 };
 
+
+
+
 #define BTREE_FLAG(flag)						\
 static inline bool btree_node_ ## flag(struct btree *b)			\
 {	return test_bit(BTREE_NODE_ ## flag, &b->flags); }		\
@@ -216,6 +219,25 @@  struct btree_op {
 	unsigned int		insert_collision:1;
 };
 
+struct btree_check_state;
+struct btree_check_info {
+	struct btree_check_state	*state;
+	struct task_struct		*thread;
+	int				result;
+};
+
+#define BCH_BTR_CHKTHREAD_MAX	64
+struct btree_check_state {
+	struct cache_set		*c;
+	int				total_threads;
+	int				key_idx;
+	spinlock_t			idx_lock;
+	atomic_t			started;
+	atomic_t			enough;
+	wait_queue_head_t		wait;
+	struct btree_check_info		infos[BCH_BTR_CHKTHREAD_MAX];
+};
+
 static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
 {
 	memset(op, 0, sizeof(struct btree_op));