make bch_btree_check() to be multiple threads

mainline inclusion from mainline-v5.7-rc1 commit 8e710227 category: performance bugzilla: https://bugzilla.openeuler.org/show_bug.cgi?id=327 CVE: NA When registering a cache device, bch_btree_check() is called to check all btree nodes, to make sure the btree is consistent and not corrupted. bch_btree_check() is recursively executed in a single thread, when there are a lot of data cached and the btree is huge, it may take very long time to check all the btree nodes. In my testing, I observed it took around 50 minutes to finish bch_btree_check(). When checking the bcache btree nodes, the cache set is not running yet, and indeed the whole tree is in read-only state, it is safe to create multiple threads to check the btree in parallel. This patch tries to create multiple threads, and each thread tries to one-by-one check the sub-tree indexed by a key from the btree root node. The parallel thread number depends on how many keys in the btree root node. At most BCH_BTR_CHKTHREAD_MAX (64) threads can be created, but in practice is should be min(cpu-number/2, root-node-keys-number). Signed-off-by: N Coly Li <colyli@suse.de> Signed-off-by: N Jens Axboe <axboe@kernel.dk> Signed-off-by: N qinghaixiang <xuweiqhx@163.com> Signed-off-by: N Xu Wei <xuwei56@huawei.com> Acked-by: N Xie XiuQi <xiexiuqi@huawei.com> Reviewed-by: N Li Ruilin <liruilin4@huawei.com> Signed-off-by: N Yang Yingliang <yangyingliang@huawei.com>

make bch_btree_check() to be multiple threads
mainline inclusion from mainline-v5.7-rc1 commit 8e710227 category: performance bugzilla: https://bugzilla.openeuler.org/show_bug.cgi?id=327 CVE: NA When registering a cache device, bch_btree_check() is called to check all btree nodes, to make sure the btree is consistent and not corrupted. bch_btree_check() is recursively executed in a single thread, when there are a lot of data cached and the btree is huge, it may take very long time to check all the btree nodes. In my testing, I observed it took around 50 minutes to finish bch_btree_check(). When checking the bcache btree nodes, the cache set is not running yet, and indeed the whole tree is in read-only state, it is safe to create multiple threads to check the btree in parallel. This patch tries to create multiple threads, and each thread tries to one-by-one check the sub-tree indexed by a key from the btree root node. The parallel thread number depends on how many keys in the btree root node. At most BCH_BTR_CHKTHREAD_MAX (64) threads can be created, but in practice is should be min(cpu-number/2, root-node-keys-number). Signed-off-by: N Coly Li <colyli@suse.de> Signed-off-by: N Jens Axboe <axboe@kernel.dk> Signed-off-by: N qinghaixiang <xuweiqhx@163.com> Signed-off-by: N Xu Wei <xuwei56@huawei.com> Acked-by: N Xie XiuQi <xiexiuqi@huawei.com> Reviewed-by: N Li Ruilin <liruilin4@huawei.com> Signed-off-by: N Yang Yingliang <yangyingliang@huawei.com>
b4a620ff · Coly Li · Yang Yingliang · b278db77 · b4a620ff · b4a620ff
隐藏空白更改
内联并排

Showing with 246 addition and 63 deletion

drivers/md/bcache/btree.c drivers/md/bcache/btree.c +161 -63

drivers/md/bcache/btree.h drivers/md/bcache/btree.h +85 -0

未找到文件。
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -101,65 +101,6 @@
 #define insert_lock(s, b)	((b)->level <= (s)->lock)
-/*
- * These macros are for recursing down the btree - they handle the details of
- * locking and looking up nodes in the cache for you. They're best treated as
- * mere syntax when reading code that uses them.
- *
- * op->lock determines whether we take a read or a write lock at a given depth.
- * If you've got a read lock and find that you need a write lock (i.e. you're
- * going to have to split), set op->lock and return -EINTR; btree_root() will
- * call you again and you'll have the correct lock.
- */
-/**
- * btree - recurse down the btree on a specified key
- * @fn:		function to call, which will be passed the child node
- * @key:	key to recurse on
- * @b:		parent btree node
- * @op:		pointer to struct btree_op
- */
-#define btree(fn, key, b, op, ...)					\
-({									\
-	int _r, l = (b)->level - 1;					\
-	bool _w = l <= (op)->lock;					\
-	struct btree *_child = bch_btree_node_get((b)->c, op, key, l,	\
-						  _w, b);		\
-	if (!IS_ERR(_child)) {						\
-		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
-		rw_unlock(_w, _child);					\
-	} else								\
-		_r = PTR_ERR(_child);					\
-	_r;								\
-})
-/**
- * btree_root - call a function on the root of the btree
- * @fn:		function to call, which will be passed the child node
- * @c:		cache set
- * @op:		pointer to struct btree_op
- */
-#define btree_root(fn, c, op, ...)					\
-({									\
-	int _r = -EINTR;						\
-	do {								\
-		struct btree *_b = (c)->root;				\
-		bool _w = insert_lock(op, _b);				\
-		rw_lock(_w, _b, _b->level);				\
-		if (_b == (c)->root &&					\
-		    _w == insert_lock(op, _b)) {			\
-			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
-		}							\
-		rw_unlock(_w, _b);					\
-		bch_cannibalize_unlock(c);				\
-		if (_r == -EINTR)					\
-			schedule();					\
-	} while (_r == -EINTR);						\
-									\
-	finish_wait(&(c)->btree_cache_wait, &(op)->wait);		\
-	_r;								\
-})
 static inline struct bset *write_block(struct btree *b)
 {
 	return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
@@ -1949,13 +1890,170 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
 	return ret;
 }
+static int bch_btree_check_thread(void *arg)
+{
+	int ret;
+	struct btree_check_info *info = arg;
+	struct btree_check_state *check_state = info->state;
+	struct cache_set *c = check_state->c;
+	struct btree_iter iter;
+	struct bkey *k, *p;
+	int cur_idx, prev_idx, skip_nr;
+	k = p = NULL;
+	cur_idx = prev_idx = 0;
+	ret = 0;
+	/* root node keys are checked before thread created */
+	bch_btree_iter_init(&c->root->keys, &iter, NULL);
+	k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
+	WARN_ON(!k);
+	p = k;
+	while (k) {
+		/*
+		 * Fetch a root node key index, skip the keys which
+		 * should be fetched by other threads, then check the
+		 * sub-tree indexed by the fetched key.
+		 */
+		spin_lock(&check_state->idx_lock);
+		cur_idx = check_state->key_idx;
+		check_state->key_idx++;
+		spin_unlock(&check_state->idx_lock);
+		skip_nr = cur_idx - prev_idx;
+		while (skip_nr) {
+			k = bch_btree_iter_next_filter(&iter,
+				&c->root->keys,
+				bch_ptr_bad);
+			if (k)
+				p = k;
+			else {
+				/*
+				 * No more keys to check in root node,
+				 * current checking threads are enough,
+				 * stop creating more.
+				 */
+				atomic_set(&check_state->enough, 1);
+				/* Update check_state->enough earlier */
+				smp_mb();
+				goto out;
+			}
+			skip_nr--;
+			cond_resched();
+		}
+		if (p) {
+			struct btree_op op;
+			btree_node_prefetch(c->root, p);
+			c->gc_stats.nodes++;
+			bch_btree_op_init(&op, 0);
+			ret = btree(check_recurse, p, c->root, &op);
+			if (ret)
+				goto out;
+		}
+		p = NULL;
+		prev_idx = cur_idx;
+		cond_resched();
+	}
+out:
+	info->result = ret;
+	/* update check_state->started among all CPUs */
+	smp_mb();
+	if (atomic_dec_and_test(&check_state->started))
+		wake_up(&check_state->wait);
+	return ret;
+}
+static int bch_btree_chkthread_nr(void)
+{
+	int n = num_online_cpus() / 2;
+	if (n == 0)
+		n = 1;
+	else if (n > BCH_BTR_CHKTHREAD_MAX)
+		n = BCH_BTR_CHKTHREAD_MAX;
+	return n;
+}
 int bch_btree_check(struct cache_set *c)
 {
-	struct btree_op op;
+	int ret = 0;
+	int i;
+	struct bkey *k = NULL;
+	struct btree_iter iter;
+	struct btree_check_state *check_state;
+	char name[32];
-	bch_btree_op_init(&op, SHRT_MAX);
+	/* check and mark root node keys */
+	for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid)
+		bch_initial_mark_key(c, c->root->level, k);
+	bch_initial_mark_key(c, c->root->level + 1, &c->root->key);
+	if (c->root->level == 0)
+		return 0;
+	check_state = kzalloc(sizeof(struct btree_check_state), GFP_KERNEL);
+	if (!check_state)
+		return -ENOMEM;
-	return btree_root(check_recurse, c, &op);
+	check_state->c = c;
+	check_state->total_threads = bch_btree_chkthread_nr();
+	check_state->key_idx = 0;
+	spin_lock_init(&check_state->idx_lock);
+	atomic_set(&check_state->started, 0);
+	atomic_set(&check_state->enough, 0);
+	init_waitqueue_head(&check_state->wait);
+	/*
+	 * Run multiple threads to check btree nodes in parallel,
+	 * if check_state->enough is non-zero, it means current
+	 * running check threads are enough, unncessary to create
+	 * more.
+	 */
+	for (i = 0; i < check_state->total_threads; i++) {
+		/* fetch latest check_state->enough earlier */
+		smp_mb();
+		if (atomic_read(&check_state->enough))
+			break;
+		check_state->infos[i].result = 0;
+		check_state->infos[i].state = check_state;
+		snprintf(name, sizeof(name), "bch_btrchk[%u]", i);
+		atomic_inc(&check_state->started);
+		check_state->infos[i].thread =
+			kthread_run(bch_btree_check_thread,
+				&check_state->infos[i],
+				name);
+		if (IS_ERR(check_state->infos[i].thread)) {
+			pr_err("fails to run thread bch_btrchk[%d]\n", i);
+			for (--i; i >= 0; i--)
+				kthread_stop(check_state->infos[i].thread);
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+	wait_event_interruptible(check_state->wait,
+		atomic_read(&check_state->started) == 0);
+	for (i = 0; i < check_state->total_threads; i++) {
+		if (check_state->infos[i].result) {
+			ret = check_state->infos[i].result;
+			goto out;
+		}
+	}
+out:
+	kfree(check_state);
+	return ret;
 }
 void bch_initial_gc_finish(struct cache_set *c)
@@ -2416,7 +2514,7 @@ int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
 	return btree_root(map_nodes_recurse, c, op, from, fn, flags);
 }
-static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
+int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
 				      struct bkey *from, btree_map_keys_fn *fn,
 				      int flags)
 {

--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -222,6 +222,25 @@ struct btree_op {
 	unsigned int		insert_collision:1;
 };
+struct btree_check_state;
+struct btree_check_info {
+	struct btree_check_state *state;
+	struct task_struct *thread;
+	int result;
+};
+#define BCH_BTR_CHKTHREAD_MAX	64
+struct btree_check_state {
+	struct cache_set *c;
+	int total_threads;
+	int key_idx;
+	spinlock_t idx_lock;
+	atomic_t started;
+	atomic_t enough;
+	wait_queue_head_t wait;
+	struct btree_check_info infos[BCH_BTR_CHKTHREAD_MAX];
+};
 static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
 {
 	memset(op, 0, sizeof(struct btree_op));
@@ -266,12 +285,78 @@ void bch_initial_gc_finish(struct cache_set *c);
 void bch_moving_gc(struct cache_set *c, bool only_move_dirty);
 int bch_btree_check(struct cache_set *c);
 void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k);
+typedef int (btree_map_keys_fn)(struct btree_op *op, struct btree *b,
+			struct bkey *k);
+int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
+			struct bkey *from, btree_map_keys_fn *fn, int flags);
+int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
+			struct bkey *from, btree_map_keys_fn *fn,
+			int flags);
 static inline void wake_up_gc(struct cache_set *c)
 {
 	wake_up(&c->gc_wait);
 }
+/*
+ * These macros are for recursing down the btree - they handle the details of
+ * locking and looking up nodes in the cache for you. They're best treated as
+ * mere syntax when reading code that uses them.
+ *
+ * op->lock determines whether we take a read or a write lock at a given depth.
+ * If you've got a read lock and find that you need a write lock (i.e. you're
+ * going to have to split), set op->lock and return -EINTR; btree_root() will
+ * call you again and you'll have the correct lock.
+ */
+/**
+ * btree - recurse down the btree on a specified key
+ * @fn:		function to call, which will be passed the child node
+ * @key:	key to recurse on
+ * @b:		parent btree node
+ * @op:		pointer to struct btree_op
+ */
+#define btree(fn, key, b, op, ...)					\
+({									\
+	int _r, l = (b)->level - 1;					\
+	bool _w = l <= (op)->lock;					\
+	struct btree *_child = bch_btree_node_get((b)->c, op, key, l,	\
+							_w, b);		\
+	if (!IS_ERR(_child)) {						\
+		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
+		rw_unlock(_w, _child);					\
+	} else								\
+		_r = PTR_ERR(_child);					\
+	_r;								\
+})
+/**
+ * btree_root - call a function on the root of the btree
+ * @fn:		function to call, which will be passed the child node
+ * @c:		cache set
+ * @op:		pointer to struct btree_op
+ */
+#define btree_root(fn, c, op, ...)					\
+({									\
+	int _r = -EINTR;						\
+	do {								\
+		struct btree *_b = (c)->root;				\
+		bool _w = insert_lock(op, _b);				\
+		rw_lock(_w, _b, _b->level);				\
+		if (_b == (c)->root &&					\
+		    _w == insert_lock(op, _b)) {			\
+			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
+		}							\
+		rw_unlock(_w, _b);					\
+		bch_cannibalize_unlock(c);				\
+		if (_r == -EINTR)					\
+			schedule();					\
+	} while (_r == -EINTR);						\
+									\
+	finish_wait(&(c)->btree_cache_wait, &(op)->wait);		\
+	_r;								\
+})
 #define MAP_DONE	0
 #define MAP_CONTINUE	1