// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Uses a block device as cache for other block devices; optimized for SSDs.
 * All allocation is done in buckets, which should match the erase block size
 * of the device.
 *
 * Buckets containing cached data are kept on a heap sorted by priority;
 * bucket priority is increased on cache hit, and periodically all the buckets
 * on the heap have their priority scaled down. This currently is just used as
 * an LRU but in the future should allow for more intelligent heuristics.
 *
 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
 * counter. Garbage collection is used to remove stale pointers.
 *
 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
 * as keys are inserted we only sort the pages that have not yet been written.
 * When garbage collection is run, we resort the entire node.
 *
 * All configuration is done via sysfs; see Documentation/admin-guide/bcache.rst.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/sched/clock.h>
#include <linux/rculist.h>

#include <trace/events/bcache.h>

/*
 * Todo:
 * register_bcache: Return errors out to userspace correctly
 *
 * Writeback: don't undirty key until after a cache flush
 *
 * Create an iterator for key pointers
 *
 * On btree write error, mark bucket such that it won't be freed from the cache
 *
 * Journalling:
 *   Check for bad keys in replay
 *   Propagate barriers
 *   Refcount journal entries in journal_replay
 *
 * Garbage collection:
 *   Finish incremental gc
 *   Gc should free old UUIDs, data for invalid UUIDs
 *
 * Provide a way to list backing device UUIDs we have data cached for, and
 * probably how long it's been since we've seen them, and a way to invalidate
 * dirty data for devices that will never be attached again
 *
 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
 * that based on that and how much dirty data we have we can keep writeback
 * from being starved
 *
 * Add a tracepoint or somesuch to watch for writeback starvation
 *
 * When btree depth > 1 and splitting an interior node, we have to make sure
 * alloc_bucket() cannot fail. This should be true but is not completely
 * obvious.
 *
 * Plugging?
 *
 * If data write is less than hard sector size of ssd, round up offset in open
 * bucket to the next whole sector
 *
 * Superblock needs to be fleshed out for multiple cache devices
 *
 * Add a sysfs tunable for the number of writeback IOs in flight
 *
 * Add a sysfs tunable for the number of open data buckets
 *
 * IO tracking: Can we track when one process is doing io on behalf of another?
 * IO tracking: Don't use just an average, weigh more recent stuff higher
 *
 * Test module load/unload
 */

#define MAX_NEED_GC		64
#define MAX_SAVE_PRIO		72
#define MAX_GC_TIMES		100
#define MIN_GC_NODES		100
#define GC_SLEEP_MS		100

#define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))

#define PTR_HASH(c, k)							\
	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))

#define insert_lock(s, b)	((b)->level <= (s)->lock)
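
/*
 * Reading the macro directly: insert_lock() is true when this node's level is
 * at or below the level (op->lock) at which the caller needs write locks, so
 * the traversal takes the node's lock in write mode.
 */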

/*
 * These macros are for recursing down the btree - they handle the details of
 * locking and looking up nodes in the cache for you. They're best treated as
 * mere syntax when reading code that uses them.
 *
 * op->lock determines whether we take a read or a write lock at a given depth.
 * If you've got a read lock and find that you need a write lock (i.e. you're
 * going to have to split), set op->lock and return -EINTR; btree_root() will
 * call you again and you'll have the correct lock.
 */

/**
 * btree - recurse down the btree on a specified key
 * @fn:		function to call, which will be passed the child node
 * @key:	key to recurse on
 * @b:		parent btree node
 * @op:		pointer to struct btree_op
 */
#define btree(fn, key, b, op, ...)					\
({									\
	int _r, l = (b)->level - 1;					\
	bool _w = l <= (op)->lock;					\
	struct btree *_child = bch_btree_node_get((b)->c, op, key, l,	\
						  _w, b);		\
	if (!IS_ERR(_child)) {						\
		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
		rw_unlock(_w, _child);					\
	} else								\
		_r = PTR_ERR(_child);					\
	_r;								\
})
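
/*
 * Illustrative sketch (hypothetical helper name): a recursive operation
 * bch_btree_foo() would descend to the child covering `key' with
 *
 *	ret = btree(foo, key, b, op);
 *
 * which looks the child up in the cache with the appropriate lock held,
 * calls bch_btree_foo(child, op) and drops the lock again.
 */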

/**
 * btree_root - call a function on the root of the btree
 * @fn:		function to call, which will be passed the child node
 * @c:		cache set
 * @op:		pointer to struct btree_op
 */
#define btree_root(fn, c, op, ...)					\
({									\
	int _r = -EINTR;						\
	do {								\
		struct btree *_b = (c)->root;				\
		bool _w = insert_lock(op, _b);				\
		rw_lock(_w, _b, _b->level);				\
		if (_b == (c)->root &&					\
		    _w == insert_lock(op, _b)) {			\
			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
		}							\
		rw_unlock(_w, _b);					\
		bch_cannibalize_unlock(c);				\
		if (_r == -EINTR)					\
			schedule();					\
	} while (_r == -EINTR);						\
									\
	finish_wait(&(c)->btree_cache_wait, &(op)->wait);		\
	_r;								\
})
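
/*
 * Example from later in this file: garbage collection walks the whole tree
 * with
 *
 *	ret = btree_root(gc_root, c, &op, &writes, &stats);
 *
 * retrying bch_btree_gc_root() on the (possibly changing) root until it stops
 * returning -EINTR.
 */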

static inline struct bset *write_block(struct btree *b)
{
	return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
}
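
/*
 * Example (illustrative): with 4 KiB btree blocks, a node with b->written == 3
 * has its currently open bset starting 3 * 4096 bytes past the first bset in
 * the node.
 */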

static void bch_btree_init_next(struct btree *b)
{
	/* If not a leaf node, always sort */
	if (b->level && b->keys.nsets)
		bch_btree_sort(&b->keys, &b->c->sort);
	else
		bch_btree_sort_lazy(&b->keys, &b->c->sort);

	if (b->written < btree_blocks(b))
		bch_bset_init_next(&b->keys, write_block(b),
				   bset_magic(&b->c->sb));

}

/* Btree key manipulation */

void bkey_put(struct cache_set *c, struct bkey *k)
{
	unsigned int i;

	for (i = 0; i < KEY_PTRS(k); i++)
		if (ptr_available(c, k, i))
			atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
}

/* Btree IO */
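
/*
 * Note: the "+ 8" below skips the first field of struct bset - the on-disk
 * 64-bit csum itself - so the checksum covers everything after it.
 */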

static uint64_t btree_csum_set(struct btree *b, struct bset *i)
{
	uint64_t crc = b->key.ptr[0];
	void *data = (void *) i + 8, *end = bset_bkey_last(i);

	crc = bch_crc64_update(crc, data, end - data);
	return crc ^ 0xffffffffffffffffULL;
}

void bch_btree_node_read_done(struct btree *b)
{
	const char *err = "bad btree header";
	struct bset *i = btree_bset_first(b);
	struct btree_iter *iter;

	iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
	iter->used = 0;

#ifdef CONFIG_BCACHE_DEBUG
	iter->b = &b->keys;
#endif

	if (!i->seq)
		goto err;

	for (;
	     b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
	     i = write_block(b)) {
		err = "unsupported bset version";
		if (i->version > BCACHE_BSET_VERSION)
			goto err;

		err = "bad btree header";
		if (b->written + set_blocks(i, block_bytes(b->c)) >
		    btree_blocks(b))
			goto err;

		err = "bad magic";
		if (i->magic != bset_magic(&b->c->sb))
			goto err;

		err = "bad checksum";
		switch (i->version) {
		case 0:
			if (i->csum != csum_set(i))
				goto err;
			break;
		case BCACHE_BSET_VERSION:
			if (i->csum != btree_csum_set(b, i))
				goto err;
			break;
		}

		err = "empty set";
		if (i != b->keys.set[0].data && !i->keys)
			goto err;

		bch_btree_iter_push(iter, i->start, bset_bkey_last(i));

		b->written += set_blocks(i, block_bytes(b->c));
	}

	err = "corrupted btree";
	for (i = write_block(b);
	     bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
	     i = ((void *) i) + block_bytes(b->c))
		if (i->seq == b->keys.set[0].data->seq)
			goto err;

	bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);

	i = b->keys.set[0].data;
	err = "short btree key";
	if (b->keys.set[0].size &&
	    bkey_cmp(&b->key, &b->keys.set[0].end) < 0)
		goto err;

	if (b->written < btree_blocks(b))
		bch_bset_init_next(&b->keys, write_block(b),
				   bset_magic(&b->c->sb));
out:
	mempool_free(iter, &b->c->fill_iter);
	return;
err:
	set_btree_node_io_error(b);
	bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
			    err, PTR_BUCKET_NR(b->c, &b->key, 0),
			    bset_block_offset(b, i), i->keys);
	goto out;
}

static void btree_node_read_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;

	closure_put(cl);
}

static void bch_btree_node_read(struct btree *b)
{
	uint64_t start_time = local_clock();
	struct closure cl;
	struct bio *bio;

	trace_bcache_btree_read(b);

	closure_init_stack(&cl);

	bio = bch_bbio_alloc(b->c);
	bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9;
	bio->bi_end_io	= btree_node_read_endio;
	bio->bi_private	= &cl;
	bio->bi_opf = REQ_OP_READ | REQ_META;

	bch_bio_map(bio, b->keys.set[0].data);

	bch_submit_bbio(bio, b->c, &b->key, 0);
	closure_sync(&cl);

	if (bio->bi_status)
		set_btree_node_io_error(b);

	bch_bbio_free(bio, b->c);

	if (btree_node_io_error(b))
		goto err;

	bch_btree_node_read_done(b);
	bch_time_stats_update(&b->c->btree_read_time, start_time);

	return;
err:
	bch_cache_set_error(b->c, "io error reading bucket %zu",
			    PTR_BUCKET_NR(b->c, &b->key, 0));
}

static void btree_complete_write(struct btree *b, struct btree_write *w)
{
	if (w->prio_blocked &&
	    !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
		wake_up_allocators(b->c);

	if (w->journal) {
		atomic_dec_bug(w->journal);
		__closure_wake_up(&b->c->journal.wait);
	}

	w->prio_blocked	= 0;
	w->journal	= NULL;
}

static void btree_node_write_unlock(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io);

	up(&b->io_mutex);
}

static void __btree_node_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io);
	struct btree_write *w = btree_prev_write(b);

	bch_bbio_free(b->bio, b->c);
	b->bio = NULL;
	btree_complete_write(b, w);

	if (btree_node_dirty(b))
		schedule_delayed_work(&b->work, 30 * HZ);

	closure_return_with_destructor(cl, btree_node_write_unlock);
}

static void btree_node_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io);

	bio_free_pages(b->bio);
	__btree_node_write_done(cl);
}

static void btree_node_write_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;
	struct btree *b = container_of(cl, struct btree, io);

	if (bio->bi_status)
		set_btree_node_io_error(b);

	bch_bbio_count_io_errors(b->c, bio, bio->bi_status, "writing btree");
	closure_put(cl);
}

static void do_btree_node_write(struct btree *b)
{
	struct closure *cl = &b->io;
	struct bset *i = btree_bset_last(b);
	BKEY_PADDED(key) k;

	i->version	= BCACHE_BSET_VERSION;
	i->csum		= btree_csum_set(b, i);

	BUG_ON(b->bio);
	b->bio = bch_bbio_alloc(b->c);

	b->bio->bi_end_io	= btree_node_write_endio;
	b->bio->bi_private	= cl;
	b->bio->bi_iter.bi_size	= roundup(set_bytes(i), block_bytes(b->c));
	b->bio->bi_opf		= REQ_OP_WRITE | REQ_META | REQ_FUA;
	bch_bio_map(b->bio, i);

	/*
	 * If we're appending to a leaf node, we don't technically need FUA -
	 * this write just needs to be persisted before the next journal write,
	 * which will be marked FLUSH|FUA.
	 *
	 * Similarly if we're writing a new btree root - the pointer is going to
	 * be in the next journal entry.
	 *
	 * But if we're writing a new btree node (that isn't a root) or
	 * appending to a non leaf btree node, we need either FUA or a flush
	 * when we write the parent with the new pointer. FUA is cheaper than a
	 * flush, and writes appending to leaf nodes aren't blocking anything so
	 * just make all btree node writes FUA to keep things sane.
	 */

	bkey_copy(&k.key, &b->key);
	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
		       bset_sector_offset(&b->keys, i));

	if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
		int j;
		struct bio_vec *bv;
		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));

		bio_for_each_segment_all(bv, b->bio, j)
			memcpy(page_address(bv->bv_page),
			       base + j * PAGE_SIZE, PAGE_SIZE);

		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		continue_at(cl, btree_node_write_done, NULL);
	} else {
		/* No problem for multipage bvec since the bio was just allocated */
		b->bio->bi_vcnt = 0;
		bch_bio_map(b->bio, i);

		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		closure_sync(cl);
		continue_at_nobarrier(cl, __btree_node_write_done, NULL);
	}
}

void __bch_btree_node_write(struct btree *b, struct closure *parent)
{
	struct bset *i = btree_bset_last(b);

	lockdep_assert_held(&b->write_lock);

	trace_bcache_btree_write(b);

	BUG_ON(current->bio_list);
	BUG_ON(b->written >= btree_blocks(b));
	BUG_ON(b->written && !i->keys);
	BUG_ON(btree_bset_first(b)->seq != i->seq);
	bch_check_keys(&b->keys, "writing");

	cancel_delayed_work(&b->work);

	/* If caller isn't waiting for write, parent refcount is cache set */
	down(&b->io_mutex);
	closure_init(&b->io, parent ?: &b->c->cl);

	clear_bit(BTREE_NODE_dirty,	 &b->flags);
	change_bit(BTREE_NODE_write_idx, &b->flags);

	do_btree_node_write(b);

	atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);

	b->written += set_blocks(i, block_bytes(b->c));
}

void bch_btree_node_write(struct btree *b, struct closure *parent)
{
	unsigned int nsets = b->keys.nsets;

	lockdep_assert_held(&b->lock);

	__bch_btree_node_write(b, parent);

	/*
	 * do verify if there was more than one set initially (i.e. we did a
	 * sort) and we sorted down to a single set:
	 */
	if (nsets && !b->keys.nsets)
		bch_btree_verify(b);

	bch_btree_init_next(b);
}

static void bch_btree_node_write_sync(struct btree *b)
{
	struct closure cl;

	closure_init_stack(&cl);

	mutex_lock(&b->write_lock);
	bch_btree_node_write(b, &cl);
	mutex_unlock(&b->write_lock);

	closure_sync(&cl);
}

static void btree_node_write_work(struct work_struct *w)
{
	struct btree *b = container_of(to_delayed_work(w), struct btree, work);

	mutex_lock(&b->write_lock);
	if (btree_node_dirty(b))
		__bch_btree_node_write(b, NULL);
	mutex_unlock(&b->write_lock);
}

static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
{
	struct bset *i = btree_bset_last(b);
	struct btree_write *w = btree_current_write(b);

	lockdep_assert_held(&b->write_lock);

	BUG_ON(!b->written);
	BUG_ON(!i->keys);

	if (!btree_node_dirty(b))
		schedule_delayed_work(&b->work, 30 * HZ);

	set_btree_node_dirty(b);

	if (journal_ref) {
		if (w->journal &&
		    journal_pin_cmp(b->c, w->journal, journal_ref)) {
			atomic_dec_bug(w->journal);
			w->journal = NULL;
		}

		if (!w->journal) {
			w->journal = journal_ref;
			atomic_inc(w->journal);
		}
	}

	/* Force write if set is too big */
	if (set_bytes(i) > PAGE_SIZE - 48 &&
	    !current->bio_list)
		bch_btree_node_write(b, NULL);
}
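
/*
 * The PAGE_SIZE - 48 threshold in bch_btree_leaf_dirty() is a heuristic: once
 * the open bset approaches a page in size it is written out eagerly, unless
 * we're already under generic_make_request (current->bio_list set).
 */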

/*
 * Btree in memory cache - allocation/freeing
 * mca -> memory cache
 */

#define mca_reserve(c)	(((c->root && c->root->level)		\
			  ? c->root->level : 1) * 8 + 16)
#define mca_can_free(c)						\
	max_t(int, 0, c->btree_cache_used - mca_reserve(c))
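
/*
 * Example (illustrative): with a btree of depth two (c->root->level == 2) the
 * reserve is 2 * 8 + 16 = 32 nodes, and mca_can_free() only counts cached
 * nodes above that floor as freeable.
 */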

static void mca_data_free(struct btree *b)
{
	BUG_ON(b->io_mutex.count != 1);

	bch_btree_keys_free(&b->keys);

	b->c->btree_cache_used--;
	list_move(&b->list, &b->c->btree_cache_freed);
}

static void mca_bucket_free(struct btree *b)
{
	BUG_ON(btree_node_dirty(b));

	b->key.ptr[0] = 0;
	hlist_del_init_rcu(&b->hash);
	list_move(&b->list, &b->c->btree_cache_freeable);
}

static unsigned int btree_order(struct bkey *k)
{
	return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
}
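
/*
 * Example (illustrative, assuming 4 KiB pages so PAGE_SECTORS == 8): a node
 * whose key has KEY_SIZE() == 512 sectors (256 KiB) spans 64 pages, so
 * btree_order() returns ilog2(64) == 6.
 */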

static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
{
	if (!bch_btree_keys_alloc(&b->keys,
				  max_t(unsigned int,
					ilog2(b->c->btree_pages),
					btree_order(k)),
				  gfp)) {
		b->c->btree_cache_used++;
		list_move(&b->list, &b->c->btree_cache);
	} else {
		list_move(&b->list, &b->c->btree_cache_freed);
	}
}

static struct btree *mca_bucket_alloc(struct cache_set *c,
				      struct bkey *k, gfp_t gfp)
{
	struct btree *b = kzalloc(sizeof(struct btree), gfp);

	if (!b)
		return NULL;

	init_rwsem(&b->lock);
	lockdep_set_novalidate_class(&b->lock);
	mutex_init(&b->write_lock);
	lockdep_set_novalidate_class(&b->write_lock);
	INIT_LIST_HEAD(&b->list);
	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
	b->c = c;
	sema_init(&b->io_mutex, 1);

	mca_data_alloc(b, k, gfp);
	return b;
}

static int mca_reap(struct btree *b, unsigned int min_order, bool flush)
{
	struct closure cl;

	closure_init_stack(&cl);
	lockdep_assert_held(&b->c->bucket_lock);

	if (!down_write_trylock(&b->lock))
		return -ENOMEM;

	BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data);

	if (b->keys.page_order < min_order)
		goto out_unlock;

	if (!flush) {
		if (btree_node_dirty(b))
			goto out_unlock;

		if (down_trylock(&b->io_mutex))
			goto out_unlock;
		up(&b->io_mutex);
	}

	mutex_lock(&b->write_lock);
	if (btree_node_dirty(b))
		__bch_btree_node_write(b, &cl);
	mutex_unlock(&b->write_lock);

	closure_sync(&cl);

	/* wait for any in flight btree write */
	down(&b->io_mutex);
	up(&b->io_mutex);

	return 0;
out_unlock:
	rw_unlock(true, b);
	return -ENOMEM;
}

static unsigned long bch_mca_scan(struct shrinker *shrink,
				  struct shrink_control *sc)
{
	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
	struct btree *b, *t;
	unsigned long i, nr = sc->nr_to_scan;
	unsigned long freed = 0;
	unsigned int btree_cache_used;

	if (c->shrinker_disabled)
		return SHRINK_STOP;

	if (c->btree_cache_alloc_lock)
		return SHRINK_STOP;

	/* Return -1 if we can't do anything right now */
	if (sc->gfp_mask & __GFP_IO)
		mutex_lock(&c->bucket_lock);
	else if (!mutex_trylock(&c->bucket_lock))
		return -1;

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	nr /= c->btree_pages;
	nr = min_t(unsigned long, nr, mca_can_free(c));

	i = 0;
	btree_cache_used = c->btree_cache_used;
	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
		if (nr <= 0)
			goto out;

		if (++i > 3 &&
		    !mca_reap(b, 0, false)) {
			mca_data_free(b);
			rw_unlock(true, b);
			freed++;
		}
		nr--;
	}

	for (;  (nr--) && i < btree_cache_used; i++) {
		if (list_empty(&c->btree_cache))
			goto out;

		b = list_first_entry(&c->btree_cache, struct btree, list);
		list_rotate_left(&c->btree_cache);

		if (!b->accessed &&
		    !mca_reap(b, 0, false)) {
			mca_bucket_free(b);
			mca_data_free(b);
			rw_unlock(true, b);
			freed++;
		} else
			b->accessed = 0;
	}
out:
	mutex_unlock(&c->bucket_lock);
	return freed * c->btree_pages;
}

static unsigned long bch_mca_count(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	struct cache_set *c = container_of(shrink, struct cache_set, shrink);

	if (c->shrinker_disabled)
		return 0;

	if (c->btree_cache_alloc_lock)
		return 0;

	return mca_can_free(c) * c->btree_pages;
}
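
/*
 * Unit convention for the two callbacks above: the shrinker core counts in
 * pages, so bch_mca_scan() converts nr_to_scan into a node count by dividing
 * by c->btree_pages, and both callbacks scale node counts back up by
 * c->btree_pages when reporting.
 */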

void bch_btree_cache_free(struct cache_set *c)
{
	struct btree *b;
	struct closure cl;

	closure_init_stack(&cl);

	if (c->shrink.list.next)
		unregister_shrinker(&c->shrink);

	mutex_lock(&c->bucket_lock);

#ifdef CONFIG_BCACHE_DEBUG
	if (c->verify_data)
		list_move(&c->verify_data->list, &c->btree_cache);

	free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
#endif

	list_splice(&c->btree_cache_freeable,
		    &c->btree_cache);

	while (!list_empty(&c->btree_cache)) {
		b = list_first_entry(&c->btree_cache, struct btree, list);

		if (btree_node_dirty(b))
			btree_complete_write(b, btree_current_write(b));
		clear_bit(BTREE_NODE_dirty, &b->flags);

		mca_data_free(b);
	}

	while (!list_empty(&c->btree_cache_freed)) {
		b = list_first_entry(&c->btree_cache_freed,
				     struct btree, list);
		list_del(&b->list);
		cancel_delayed_work_sync(&b->work);
		kfree(b);
	}

	mutex_unlock(&c->bucket_lock);
}

int bch_btree_cache_alloc(struct cache_set *c)
{
	unsigned int i;

	for (i = 0; i < mca_reserve(c); i++)
		if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
			return -ENOMEM;

	list_splice_init(&c->btree_cache,
			 &c->btree_cache_freeable);

#ifdef CONFIG_BCACHE_DEBUG
	mutex_init(&c->verify_lock);

	c->verify_ondisk = (void *)
		__get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));

	c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);

	if (c->verify_data &&
	    c->verify_data->keys.set->data)
		list_del_init(&c->verify_data->list);
	else
		c->verify_data = NULL;
#endif

	c->shrink.count_objects = bch_mca_count;
	c->shrink.scan_objects = bch_mca_scan;
	c->shrink.seeks = 4;
	c->shrink.batch = c->btree_pages * 2;

	if (register_shrinker(&c->shrink))
		pr_warn("bcache: %s: could not register shrinker",
				__func__);

	return 0;
}

/* Btree in memory cache - hash table */

static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
{
	return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
}

static struct btree *mca_find(struct cache_set *c, struct bkey *k)
{
	struct btree *b;

	rcu_read_lock();
	hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
		if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
			goto out;
	b = NULL;
out:
	rcu_read_unlock();
	return b;
}

static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
{
	struct task_struct *old;

	old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
	if (old && old != current) {
		if (op)
			prepare_to_wait(&c->btree_cache_wait, &op->wait,
					TASK_UNINTERRUPTIBLE);
		return -EINTR;
	}

	return 0;
}

static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
				     struct bkey *k)
{
	struct btree *b;

	trace_bcache_btree_cache_cannibalize(c);

	if (mca_cannibalize_lock(c, op))
		return ERR_PTR(-EINTR);

	list_for_each_entry_reverse(b, &c->btree_cache, list)
		if (!mca_reap(b, btree_order(k), false))
			return b;

	list_for_each_entry_reverse(b, &c->btree_cache, list)
		if (!mca_reap(b, btree_order(k), true))
			return b;

	WARN(1, "btree cache cannibalize failed\n");
	return ERR_PTR(-ENOMEM);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
static void bch_cannibalize_unlock(struct cache_set *c)
{
	if (c->btree_cache_alloc_lock == current) {
		c->btree_cache_alloc_lock = NULL;
		wake_up(&c->btree_cache_wait);
	}
}

static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
			       struct bkey *k, int level)
{
	struct btree *b;

	BUG_ON(current->bio_list);

	lockdep_assert_held(&c->bucket_lock);

	if (mca_find(c, k))
		return NULL;

	/* btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b, &c->btree_cache_freeable, list)
		if (!mca_reap(b, btree_order(k), false))
			goto out;

	/* We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, &c->btree_cache_freed, list)
		if (!mca_reap(b, 0, false)) {
			mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
			if (!b->keys.set[0].data)
				goto err;
			else
				goto out;
		}

	b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
	if (!b)
		goto err;

	BUG_ON(!down_write_trylock(&b->lock));
	if (!b->keys.set->data)
		goto err;
out:
	BUG_ON(b->io_mutex.count != 1);

	bkey_copy(&b->key, k);
	list_move(&b->list, &c->btree_cache);
	hlist_del_init_rcu(&b->hash);
	hlist_add_head_rcu(&b->hash, mca_hash(c, k));

	lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
	b->parent	= (void *) ~0UL;
	b->flags	= 0;
	b->written	= 0;
	b->level	= level;

	if (!b->level)
		bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
				    &b->c->expensive_debug_checks);
	else
		bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
				    &b->c->expensive_debug_checks);

	return b;
err:
	if (b)
		rw_unlock(true, b);

	b = mca_cannibalize(c, op, k);
	if (!IS_ERR(b))
		goto out;

	return b;
}

/*
 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * If IO is necessary and running under generic_make_request, returns -EAGAIN.
 *
 * The btree node will have either a read or a write lock held, depending on
 * level and op->lock.
 */
struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
				 struct bkey *k, int level, bool write,
				 struct btree *parent)
{
	int i = 0;
	struct btree *b;

	BUG_ON(level < 0);
retry:
	b = mca_find(c, k);

	if (!b) {
		if (current->bio_list)
			return ERR_PTR(-EAGAIN);

		mutex_lock(&c->bucket_lock);
		b = mca_alloc(c, op, k, level);
		mutex_unlock(&c->bucket_lock);

		if (!b)
			goto retry;
		if (IS_ERR(b))
			return b;

		bch_btree_node_read(b);

		if (!write)
			downgrade_write(&b->lock);
	} else {
		rw_lock(write, b, level);
		if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
			rw_unlock(write, b);
			goto retry;
		}
		BUG_ON(b->level != level);
	}

	if (btree_node_io_error(b)) {
		rw_unlock(write, b);
		return ERR_PTR(-EIO);
	}

	BUG_ON(!b->written);

	b->parent = parent;
	b->accessed = 1;

	for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
		prefetch(b->keys.set[i].tree);
		prefetch(b->keys.set[i].data);
	}

	for (; i <= b->keys.nsets; i++)
		prefetch(b->keys.set[i].data);

	return b;
}

static void btree_node_prefetch(struct btree *parent, struct bkey *k)
{
	struct btree *b;

	mutex_lock(&parent->c->bucket_lock);
	b = mca_alloc(parent->c, NULL, k, parent->level - 1);
	mutex_unlock(&parent->c->bucket_lock);

	if (!IS_ERR_OR_NULL(b)) {
		b->parent = parent;
		bch_btree_node_read(b);
		rw_unlock(true, b);
	}
}

/* Btree alloc */

static void btree_node_free(struct btree *b)
{
	trace_bcache_btree_node_free(b);

	BUG_ON(b == b->c->root);

	mutex_lock(&b->write_lock);

	if (btree_node_dirty(b))
		btree_complete_write(b, btree_current_write(b));
	clear_bit(BTREE_NODE_dirty, &b->flags);

	mutex_unlock(&b->write_lock);

	cancel_delayed_work(&b->work);

	mutex_lock(&b->c->bucket_lock);
	bch_bucket_free(b->c, &b->key);
	mca_bucket_free(b);
	mutex_unlock(&b->c->bucket_lock);
}

struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
				     int level, bool wait,
				     struct btree *parent)
{
	BKEY_PADDED(key) k;
	struct btree *b = ERR_PTR(-EAGAIN);

	mutex_lock(&c->bucket_lock);
retry:
	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
		goto err;

	bkey_put(c, &k.key);
	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);

	b = mca_alloc(c, op, &k.key, level);
	if (IS_ERR(b))
		goto err_free;

	if (!b) {
		cache_bug(c,
			"Tried to allocate bucket that was in btree cache");
		goto retry;
	}

	b->accessed = 1;
	b->parent = parent;
	bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));

	mutex_unlock(&c->bucket_lock);

	trace_bcache_btree_node_alloc(b);
	return b;
err_free:
	bch_bucket_free(c, &k.key);
err:
	mutex_unlock(&c->bucket_lock);

	trace_bcache_btree_node_alloc_fail(c);
	return b;
}

static struct btree *bch_btree_node_alloc(struct cache_set *c,
					  struct btree_op *op, int level,
					  struct btree *parent)
{
	return __bch_btree_node_alloc(c, op, level, op != NULL, parent);
}

static struct btree *btree_node_alloc_replacement(struct btree *b,
						  struct btree_op *op)
{
	struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);

	if (!IS_ERR_OR_NULL(n)) {
		mutex_lock(&n->write_lock);
		bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
		bkey_copy_key(&n->key, &b->key);
		mutex_unlock(&n->write_lock);
	}

	return n;
}

static void make_btree_freeing_key(struct btree *b, struct bkey *k)
{
	unsigned int i;

	mutex_lock(&b->c->bucket_lock);

	atomic_inc(&b->c->prio_blocked);

	bkey_copy(k, &b->key);
	bkey_copy_key(k, &ZERO_KEY);

	for (i = 0; i < KEY_PTRS(k); i++)
		SET_PTR_GEN(k, i,
			    bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
					PTR_BUCKET(b->c, &b->key, i)));

	mutex_unlock(&b->c->bucket_lock);
}
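
/*
 * Bumping the pointer generations above is what actually "frees" the node's
 * bucket: as the header comment explains, a bucket is freed by incrementing
 * its 8-bit gen, which makes every existing pointer to it stale.
 */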

static int btree_check_reserve(struct btree *b, struct btree_op *op)
{
	struct cache_set *c = b->c;
	struct cache *ca;
	unsigned int i, reserve = (c->root->level - b->level) * 2 + 1;

	mutex_lock(&c->bucket_lock);

	for_each_cache(ca, c, i)
		if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
			if (op)
				prepare_to_wait(&c->btree_cache_wait, &op->wait,
						TASK_UNINTERRUPTIBLE);
			mutex_unlock(&c->bucket_lock);
			return -EINTR;
		}

	mutex_unlock(&c->bucket_lock);

	return mca_cannibalize_lock(b->c, op);
}
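
/*
 * Example (illustrative): for a leaf two levels below the root
 * (c->root->level == 2, b->level == 0), the reserve demanded above is
 * (2 - 0) * 2 + 1 = 5 free btree buckets per cache device.
 */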

/* Garbage collection */

static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
				    struct bkey *k)
{
	uint8_t stale = 0;
	unsigned int i;
	struct bucket *g;

	/*
	 * ptr_invalid() can't return true for the keys that mark btree nodes as
	 * freed, but since ptr_bad() returns true we'll never actually use them
	 * for anything and thus we don't want to mark their pointers here
	 */
	if (!bkey_cmp(k, &ZERO_KEY))
		return stale;

	for (i = 0; i < KEY_PTRS(k); i++) {
		if (!ptr_available(c, k, i))
			continue;

		g = PTR_BUCKET(c, k, i);

		if (gen_after(g->last_gc, PTR_GEN(k, i)))
			g->last_gc = PTR_GEN(k, i);

		if (ptr_stale(c, k, i)) {
			stale = max(stale, ptr_stale(c, k, i));
			continue;
		}

		cache_bug_on(GC_MARK(g) &&
			     (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
			     c, "inconsistent ptrs: mark = %llu, level = %i",
			     GC_MARK(g), level);

		if (level)
			SET_GC_MARK(g, GC_MARK_METADATA);
		else if (KEY_DIRTY(k))
			SET_GC_MARK(g, GC_MARK_DIRTY);
		else if (!GC_MARK(g))
			SET_GC_MARK(g, GC_MARK_RECLAIMABLE);

		/* guard against overflow */
		SET_GC_SECTORS_USED(g, min_t(unsigned int,
					     GC_SECTORS_USED(g) + KEY_SIZE(k),
					     MAX_GC_SECTORS_USED));

		BUG_ON(!GC_SECTORS_USED(g));
	}

	return stale;
}

#define btree_mark_key(b, k)	__bch_btree_mark_key(b->c, b->level, k)

void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
{
	unsigned int i;

	for (i = 0; i < KEY_PTRS(k); i++)
		if (ptr_available(c, k, i) &&
		    !ptr_stale(c, k, i)) {
			struct bucket *b = PTR_BUCKET(c, k, i);

			b->gen = PTR_GEN(k, i);

			if (level && bkey_cmp(k, &ZERO_KEY))
				b->prio = BTREE_PRIO;
			else if (!level && b->prio == BTREE_PRIO)
				b->prio = INITIAL_PRIO;
		}

	__bch_btree_mark_key(c, level, k);
}

void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats)
{
	stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets;
}
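
/*
 * Example (illustrative): with c->nbuckets == 1000 and c->avail_nbuckets ==
 * 250, stats->in_use becomes (1000 - 250) * 100 / 1000 = 75 percent.
 */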

static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
{
	uint8_t stale = 0;
	unsigned int keys = 0, good_keys = 0;
	struct bkey *k;
	struct btree_iter iter;
	struct bset_tree *t;

	gc->nodes++;

	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
		stale = max(stale, btree_mark_key(b, k));
		keys++;

		if (bch_ptr_bad(&b->keys, k))
			continue;

		gc->key_bytes += bkey_u64s(k);
		gc->nkeys++;
		good_keys++;

		gc->data += KEY_SIZE(k);
	}

	for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
		btree_bug_on(t->size &&
			     bset_written(&b->keys, t) &&
			     bkey_cmp(&b->key, &t->end) < 0,
			     b, "found short btree key in gc");

	if (b->c->gc_always_rewrite)
		return true;

	if (stale > 10)
		return true;

	if ((keys - good_keys) * 2 > keys)
		return true;

	return false;
}

#define GC_MERGE_NODES	4U

struct gc_merge_info {
	struct btree	*b;
	unsigned int	keys;
};
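
/*
 * btree_gc_recurse() below keeps a sliding window of the last GC_MERGE_NODES
 * sibling nodes it has visited in r[]; btree_gc_coalesce() then tries to
 * merge the window into one fewer node whenever the combined keys would fit.
 */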

static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
				 struct keylist *insert_keys,
				 atomic_t *journal_ref,
				 struct bkey *replace_key);

static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
			     struct gc_stat *gc, struct gc_merge_info *r)
{
	unsigned int i, nodes = 0, keys = 0, blocks;
	struct btree *new_nodes[GC_MERGE_NODES];
	struct keylist keylist;
	struct closure cl;
	struct bkey *k;

	bch_keylist_init(&keylist);

	if (btree_check_reserve(b, NULL))
		return 0;

	memset(new_nodes, 0, sizeof(new_nodes));
	closure_init_stack(&cl);

	while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
		keys += r[nodes++].keys;

	blocks = btree_default_blocks(b->c) * 2 / 3;

	if (nodes < 2 ||
	    __set_blocks(b->keys.set[0].data, keys,
			 block_bytes(b->c)) > blocks * (nodes - 1))
		return 0;

	for (i = 0; i < nodes; i++) {
		new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
		if (IS_ERR_OR_NULL(new_nodes[i]))
			goto out_nocoalesce;
	}

	/*
	 * We have to check the reserve here, after we've allocated our new
	 * nodes, to make sure the insert below will succeed - we also check
	 * before as an optimization to potentially avoid a bunch of expensive
	 * allocs/sorts
	 */
	if (btree_check_reserve(b, NULL))
		goto out_nocoalesce;

	for (i = 0; i < nodes; i++)
		mutex_lock(&new_nodes[i]->write_lock);

	for (i = nodes - 1; i > 0; --i) {
		struct bset *n1 = btree_bset_first(new_nodes[i]);
		struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
		struct bkey *k, *last = NULL;

		keys = 0;

		if (i > 1) {
			for (k = n2->start;
			     k < bset_bkey_last(n2);
			     k = bkey_next(k)) {
				if (__set_blocks(n1, n1->keys + keys +
						 bkey_u64s(k),
						 block_bytes(b->c)) > blocks)
					break;

				last = k;
				keys += bkey_u64s(k);
			}
		} else {
			/*
			 * Last node we're not getting rid of - we're getting
			 * rid of the node at r[0]. Have to try and fit all of
			 * the remaining keys into this node; we can't ensure
			 * they will always fit due to rounding and variable
			 * length keys (shouldn't be possible in practice,
			 * though)
			 */
			if (__set_blocks(n1, n1->keys + n2->keys,
					 block_bytes(b->c)) >
			    btree_blocks(new_nodes[i]))
				goto out_nocoalesce;

			keys = n2->keys;
			/* Take the key of the node we're getting rid of */
			last = &r->b->key;
		}

		BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
		       btree_blocks(new_nodes[i]));

		if (last)
			bkey_copy_key(&new_nodes[i]->key, last);

		memcpy(bset_bkey_last(n1),
		       n2->start,
		       (void *) bset_bkey_idx(n2, keys) - (void *) n2->start);

		n1->keys += keys;
		r[i].keys = n1->keys;

		memmove(n2->start,
			bset_bkey_idx(n2, keys),
			(void *) bset_bkey_last(n2) -
			(void *) bset_bkey_idx(n2, keys));

		n2->keys -= keys;

		if (__bch_keylist_realloc(&keylist,
					  bkey_u64s(&new_nodes[i]->key)))
			goto out_nocoalesce;

		bch_btree_node_write(new_nodes[i], &cl);
		bch_keylist_add(&keylist, &new_nodes[i]->key);
	}

	for (i = 0; i < nodes; i++)
		mutex_unlock(&new_nodes[i]->write_lock);

	closure_sync(&cl);

	/* We emptied out this node */
	BUG_ON(btree_bset_first(new_nodes[0])->keys);
	btree_node_free(new_nodes[0]);
	rw_unlock(true, new_nodes[0]);
	new_nodes[0] = NULL;

	for (i = 0; i < nodes; i++) {
		if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
			goto out_nocoalesce;

		make_btree_freeing_key(r[i].b, keylist.top);
		bch_keylist_push(&keylist);
	}

	bch_btree_insert_node(b, op, &keylist, NULL, NULL);
	BUG_ON(!bch_keylist_empty(&keylist));

	for (i = 0; i < nodes; i++) {
		btree_node_free(r[i].b);
		rw_unlock(true, r[i].b);

		r[i].b = new_nodes[i];
	}

	memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
	r[nodes - 1].b = ERR_PTR(-EINTR);

	trace_bcache_btree_gc_coalesce(nodes);
	gc->nodes--;

	bch_keylist_free(&keylist);

	/* Invalidated our iterator */
	return -EINTR;

out_nocoalesce:
	closure_sync(&cl);
	bch_keylist_free(&keylist);

	while ((k = bch_keylist_pop(&keylist)))
		if (!bkey_cmp(k, &ZERO_KEY))
			atomic_dec(&b->c->prio_blocked);

	for (i = 0; i < nodes; i++)
		if (!IS_ERR_OR_NULL(new_nodes[i])) {
			btree_node_free(new_nodes[i]);
			rw_unlock(true, new_nodes[i]);
		}
	return 0;
}

static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
				 struct btree *replace)
{
	struct keylist keys;
	struct btree *n;

	if (btree_check_reserve(b, NULL))
		return 0;

	n = btree_node_alloc_replacement(replace, NULL);

	/* recheck reserve after allocating replacement node */
	if (btree_check_reserve(b, NULL)) {
		btree_node_free(n);
		rw_unlock(true, n);
		return 0;
	}

	bch_btree_node_write_sync(n);

	bch_keylist_init(&keys);
	bch_keylist_add(&keys, &n->key);

	make_btree_freeing_key(replace, keys.top);
	bch_keylist_push(&keys);

	bch_btree_insert_node(b, op, &keys, NULL, NULL);
	BUG_ON(!bch_keylist_empty(&keys));

	btree_node_free(replace);
	rw_unlock(true, n);

	/* Invalidated our iterator */
	return -EINTR;
}

static unsigned int btree_gc_count_keys(struct btree *b)
{
	struct bkey *k;
	struct btree_iter iter;
	unsigned int ret = 0;

	for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
		ret += bkey_u64s(k);

	return ret;
}

static size_t btree_gc_min_nodes(struct cache_set *c)
{
	size_t min_nodes;

	/*
	 * Incremental GC pauses for 100ms whenever front-side I/O arrives, so
	 * if each pass only processed a constant number (100) of nodes, GC on
	 * a large btree would take a very long time and the front-side I/Os
	 * could run out of free buckets (no new bucket can be allocated
	 * during GC) and block again.  Instead we scale the per-pass batch
	 * with the total number of btree nodes by dividing GC into a constant
	 * (100) number of passes: with many btree nodes GC processes more
	 * nodes per pass, with few it processes fewer (but never less than
	 * MIN_GC_NODES).
	 */
	min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
	if (min_nodes < MIN_GC_NODES)
		min_nodes = MIN_GC_NODES;

	return min_nodes;
}
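
/*
 * Example (illustrative): with 50000 btree nodes, min_nodes is
 * 50000 / MAX_GC_TIMES = 500, so each GC burst covers at least 500 nodes
 * instead of stopping at the MIN_GC_NODES (100) floor.
 */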


static int btree_gc_recurse(struct btree *b, struct btree_op *op,
			    struct closure *writes, struct gc_stat *gc)
{
	int ret = 0;
	bool should_rewrite;
	struct bkey *k;
	struct btree_iter iter;
	struct gc_merge_info r[GC_MERGE_NODES];
	struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;

	bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);

	for (i = r; i < r + ARRAY_SIZE(r); i++)
		i->b = ERR_PTR(-EINTR);

	while (1) {
		k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
		if (k) {
			r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
						  true, b);
			if (IS_ERR(r->b)) {
				ret = PTR_ERR(r->b);
				break;
			}

			r->keys = btree_gc_count_keys(r->b);

			ret = btree_gc_coalesce(b, op, gc, r);
			if (ret)
				break;
		}

		if (!last->b)
			break;

		if (!IS_ERR(last->b)) {
			should_rewrite = btree_gc_mark_node(last->b, gc);
			if (should_rewrite) {
				ret = btree_gc_rewrite_node(b, op, last->b);
				if (ret)
					break;
			}

			if (last->b->level) {
				ret = btree_gc_recurse(last->b, op, writes, gc);
				if (ret)
					break;
			}

			bkey_copy_key(&b->c->gc_done, &last->b->key);

			/*
			 * Must flush leaf nodes before gc ends, since replace
			 * operations aren't journalled
			 */
			mutex_lock(&last->b->write_lock);
			if (btree_node_dirty(last->b))
				bch_btree_node_write(last->b, writes);
			mutex_unlock(&last->b->write_lock);
			rw_unlock(true, last->b);
		}

		memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
		r->b = NULL;

		if (atomic_read(&b->c->search_inflight) &&
		    gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
			gc->nodes_pre =  gc->nodes;
			ret = -EAGAIN;
			break;
		}

		if (need_resched()) {
			ret = -EAGAIN;
			break;
		}
	}

	for (i = r; i < r + ARRAY_SIZE(r); i++)
		if (!IS_ERR_OR_NULL(i->b)) {
			mutex_lock(&i->b->write_lock);
			if (btree_node_dirty(i->b))
				bch_btree_node_write(i->b, writes);
			mutex_unlock(&i->b->write_lock);
			rw_unlock(true, i->b);
		}

	return ret;
}

static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
			     struct closure *writes, struct gc_stat *gc)
{
	struct btree *n = NULL;
	int ret = 0;
	bool should_rewrite;

	should_rewrite = btree_gc_mark_node(b, gc);
	if (should_rewrite) {
		n = btree_node_alloc_replacement(b, NULL);

		if (!IS_ERR_OR_NULL(n)) {
			bch_btree_node_write_sync(n);

			bch_btree_set_root(n);
			btree_node_free(b);
			rw_unlock(true, n);

			return -EINTR;
		}
	}

	__bch_btree_mark_key(b->c, b->level + 1, &b->key);

	if (b->level) {
		ret = btree_gc_recurse(b, op, writes, gc);
		if (ret)
			return ret;
	}

	bkey_copy_key(&b->c->gc_done, &b->key);

	return ret;
}

static void btree_gc_start(struct cache_set *c)
{
	struct cache *ca;
	struct bucket *b;
	unsigned int i;

	if (!c->gc_mark_valid)
		return;

	mutex_lock(&c->bucket_lock);

	c->gc_mark_valid = 0;
	c->gc_done = ZERO_KEY;

	for_each_cache(ca, c, i)
		for_each_bucket(b, ca) {
			b->last_gc = b->gen;
			if (!atomic_read(&b->pin)) {
				SET_GC_MARK(b, 0);
				SET_GC_SECTORS_USED(b, 0);
			}
		}

	mutex_unlock(&c->bucket_lock);
}
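
/*
 * End of a mark pass: re-mark the buckets holding metadata (uuids,
 * prio/journal buckets listed in the superblock) and buckets pinned by
 * dirty writeback keys, then recount the buckets available to the
 * allocator.
 */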

static void bch_btree_gc_finish(struct cache_set *c)
{
	struct bucket *b;
	struct cache *ca;
	unsigned int i;

	mutex_lock(&c->bucket_lock);

	set_gc_sectors(c);
	c->gc_mark_valid = 1;
	c->need_gc	= 0;

	for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
			    GC_MARK_METADATA);

	/* don't reclaim buckets to which writeback keys point */
	rcu_read_lock();
	for (i = 0; i < c->devices_max_used; i++) {
		struct bcache_device *d = c->devices[i];
		struct cached_dev *dc;
		struct keybuf_key *w, *n;
		unsigned int j;

		if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
			continue;
		dc = container_of(d, struct cached_dev, disk);

		spin_lock(&dc->writeback_keys.lock);
		rbtree_postorder_for_each_entry_safe(w, n,
					&dc->writeback_keys.keys, node)
			for (j = 0; j < KEY_PTRS(&w->key); j++)
				SET_GC_MARK(PTR_BUCKET(c, &w->key, j),
					    GC_MARK_DIRTY);
		spin_unlock(&dc->writeback_keys.lock);
	}
	rcu_read_unlock();

	c->avail_nbuckets = 0;
	for_each_cache(ca, c, i) {
		uint64_t *i;

		ca->invalidate_needs_gc = 0;

		for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);

		for (i = ca->prio_buckets;
		     i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);

		for_each_bucket(b, ca) {
			c->need_gc	= max(c->need_gc, bucket_gc_gen(b));

			if (atomic_read(&b->pin))
				continue;

			BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));

			if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
				c->avail_nbuckets++;
		}
	}

	mutex_unlock(&c->bucket_lock);
}

static void bch_btree_gc(struct cache_set *c)
{
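	/*
	 * One full GC pass: walk and mark the whole tree, retrying with a
	 * short sleep whenever the incremental walk yields with -EAGAIN,
	 * then finish the mark pass, wake the allocators, update the
	 * stats and kick off moving GC.
	 */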
	int ret;
	struct gc_stat stats;
	struct closure writes;
	struct btree_op op;
	uint64_t start_time = local_clock();

	trace_bcache_gc_start(c);

	memset(&stats, 0, sizeof(struct gc_stat));
	closure_init_stack(&writes);
	bch_btree_op_init(&op, SHRT_MAX);

	btree_gc_start(c);

	/* if CACHE_SET_IO_DISABLE set, gc thread should stop too */
	do {
		ret = btree_root(gc_root, c, &op, &writes, &stats);
		closure_sync(&writes);
		cond_resched();

		if (ret == -EAGAIN)
			schedule_timeout_interruptible(msecs_to_jiffies
						       (GC_SLEEP_MS));
		else if (ret)
			pr_warn("gc failed!");
	} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));

	bch_btree_gc_finish(c);
	wake_up_allocators(c);

	bch_time_stats_update(&c->btree_gc_time, start_time);

	stats.key_bytes *= sizeof(uint64_t);
	stats.data	<<= 9;
	bch_update_bucket_in_use(c, &stats);
	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));

	trace_bcache_gc_end(c);

	bch_moving_gc(c);
}

static bool gc_should_run(struct cache_set *c)
{
	struct cache *ca;
	unsigned int i;

	for_each_cache(ca, c, i)
		if (ca->invalidate_needs_gc)
			return true;

	if (atomic_read(&c->sectors_to_gc) < 0)
		return true;

	return false;
}
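
/*
 * GC runs from its own kthread, parked on c->gc_wait; writers wake it
 * (wake_up_gc() in btree.h) once gc_should_run() turns true, i.e. when
 * a cache wants buckets invalidated or enough sectors have been written
 * since the last pass.
 */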
static int bch_gc_thread(void *arg)
{
	struct cache_set *c = arg;

	while (1) {
		wait_event_interruptible(c->gc_wait,
			   kthread_should_stop() ||
			   test_bit(CACHE_SET_IO_DISABLE, &c->flags) ||
			   gc_should_run(c));

		if (kthread_should_stop() ||
		    test_bit(CACHE_SET_IO_DISABLE, &c->flags))
			break;

		set_gc_sectors(c);
		bch_btree_gc(c);
	}

	wait_for_kthread_stop();
	return 0;
}

int bch_gc_thread_start(struct cache_set *c)
{
	c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc");
	return PTR_ERR_OR_ZERO(c->gc_thread);
}
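
/*
 * Call-site sketch (illustrative, not taken verbatim from this file):
 *
 *	if (bch_gc_thread_start(c))
 *		goto err;
 *
 * PTR_ERR_OR_ZERO() above folds a failed kthread_run() into an errno.
 */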

/* Initial partial gc */

static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
{
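	/*
	 * Mark every valid key in this node, then walk any children; the
	 * child at k is prefetched while the previous child p is recursed
	 * into, overlapping btree node reads with processing.
	 */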
	int ret = 0;
	struct bkey *k, *p = NULL;
	struct btree_iter iter;

	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
		bch_initial_mark_key(b->c, b->level, k);

	bch_initial_mark_key(b->c, b->level + 1, &b->key);

	if (b->level) {
		bch_btree_iter_init(&b->keys, &iter, NULL);

		do {
			k = bch_btree_iter_next_filter(&iter, &b->keys,
						       bch_ptr_bad);
			if (k) {
				btree_node_prefetch(b, k);
				/*
				 * initialize c->gc_stats.nodes
				 * for incremental GC
				 */
				b->c->gc_stats.nodes++;
			}

			if (p)
				ret = btree(check_recurse, p, b, op);

			p = k;
		} while (p && !ret);
	}

	return ret;
}

int bch_btree_check(struct cache_set *c)
{
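	/*
	 * Run at cache set registration, before the allocator starts: an
	 * initial mark pass over the whole tree so bucket marks and gens
	 * are valid before anything new is written.
	 */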
	struct btree_op op;

	bch_btree_op_init(&op, SHRT_MAX);

	return btree_root(check_recurse, c, &op);
}

void bch_initial_gc_finish(struct cache_set *c)
{
	struct cache *ca;
	struct bucket *b;
	unsigned int i;

	bch_btree_gc_finish(c);

	mutex_lock(&c->bucket_lock);

	/*
	 * We need to put some unused buckets directly on the prio freelist in
	 * order to get the allocator thread started - it needs freed buckets in
	 * order to rewrite the prios and gens, and it needs to rewrite prios
	 * and gens in order to free buckets.
	 *
	 * This is only safe for buckets that have no live data in them, which
	 * there should always be some of.
	 */
	for_each_cache(ca, c, i) {
		for_each_bucket(b, ca) {
			if (fifo_full(&ca->free[RESERVE_PRIO]) &&
			    fifo_full(&ca->free[RESERVE_BTREE]))
				break;

			if (bch_can_invalidate_bucket(ca, b) &&
			    !GC_MARK(b)) {
				__bch_invalidate_one_bucket(ca, b);
				if (!fifo_push(&ca->free[RESERVE_PRIO],
				   b - ca->buckets))
					fifo_push(&ca->free[RESERVE_BTREE],
						  b - ca->buckets);
			}
		}
	}

	mutex_unlock(&c->bucket_lock);
}

/* Btree insertion */

static bool btree_insert_key(struct btree *b, struct bkey *k,
			     struct bkey *replace_key)
{
	unsigned int status;

	BUG_ON(bkey_cmp(k, &b->key) > 0);

	status = bch_btree_insert_key(&b->keys, k, replace_key);
	if (status != BTREE_INSERT_STATUS_NO_INSERT) {
		bch_check_keys(&b->keys, "%u for %s", status,
			       replace_key ? "replace" : "insert");

		trace_bcache_btree_insert_key(b, k, replace_key != NULL,
					      status);
		return true;
	} else
		return false;
}

static size_t insert_u64s_remaining(struct btree *b)
{
	long ret = bch_btree_keys_u64s_remaining(&b->keys);

	/*
	 * Might land in the middle of an existing extent and have to split it
	 */
	if (b->keys.ops->is_extents)
		ret -= KEY_MAX_U64S;

	return max(ret, 0L);
}
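
/*
 * Insert as many keys from insert_keys as fit in node b: a key wholly
 * inside this node's range is inserted as-is, an extent straddling the
 * node boundary is split (front half inserted here, back half left on
 * the keylist for the next node), and anything entirely past the
 * boundary stops the loop.
 */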

static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
				  struct keylist *insert_keys,
				  struct bkey *replace_key)
{
	bool ret = false;
	int oldsize = bch_count_data(&b->keys);

	while (!bch_keylist_empty(insert_keys)) {
		struct bkey *k = insert_keys->keys;

		if (bkey_u64s(k) > insert_u64s_remaining(b))
			break;

		if (bkey_cmp(k, &b->key) <= 0) {
			if (!b->level)
				bkey_put(b->c, k);

			ret |= btree_insert_key(b, k, replace_key);
			bch_keylist_pop_front(insert_keys);
		} else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
			BKEY_PADDED(key) temp;

			bkey_copy(&temp.key, insert_keys->keys);

			bch_cut_back(&b->key, &temp.key);
			bch_cut_front(&b->key, insert_keys->keys);

			ret |= btree_insert_key(b, &temp.key, replace_key);
			break;
		} else {
			break;
		}
	}

	if (!ret)
		op->insert_collision = true;

	BUG_ON(!bch_keylist_empty(insert_keys) && b->level);

	BUG_ON(bch_count_data(&b->keys) < oldsize);
	return ret;
}

static int btree_split(struct btree *b, struct btree_op *op,
		       struct keylist *insert_keys,
		       struct bkey *replace_key)
{
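	/*
	 * Rewrite b into a replacement n1; if the compacted contents
	 * would still fill more than ~4/5 of a node, really split: keep
	 * the first ~3/5 of the keys in n1, move the rest to a new
	 * sibling n2, and (if b was the root) allocate n3 as the new
	 * root.  The keys for the new node(s) then go into the parent.
	 */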
	bool split;
	struct btree *n1, *n2 = NULL, *n3 = NULL;
	uint64_t start_time = local_clock();
	struct closure cl;
	struct keylist parent_keys;

	closure_init_stack(&cl);
	bch_keylist_init(&parent_keys);

	if (btree_check_reserve(b, op)) {
		if (!b->level)
			return -EINTR;
		else
			WARN(1, "insufficient reserve for split\n");
	}

	n1 = btree_node_alloc_replacement(b, op);
	if (IS_ERR(n1))
		goto err;

	split = set_blocks(btree_bset_first(n1),
			   block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;

	if (split) {
		unsigned int keys = 0;

		trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);

		n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent);
		if (IS_ERR(n2))
			goto err_free1;

		if (!b->parent) {
			n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL);
			if (IS_ERR(n3))
				goto err_free2;
		}

		mutex_lock(&n1->write_lock);
		mutex_lock(&n2->write_lock);

		bch_btree_insert_keys(n1, op, insert_keys, replace_key);

		/*
		 * Has to be a linear search because we don't have an auxiliary
		 * search tree yet
		 */

		while (keys < (btree_bset_first(n1)->keys * 3) / 5)
			keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1),
							keys));

		bkey_copy_key(&n1->key,
			      bset_bkey_idx(btree_bset_first(n1), keys));
		keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys));

		btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys;
		btree_bset_first(n1)->keys = keys;

		memcpy(btree_bset_first(n2)->start,
		       bset_bkey_last(btree_bset_first(n1)),
		       btree_bset_first(n2)->keys * sizeof(uint64_t));

		bkey_copy_key(&n2->key, &b->key);

		bch_keylist_add(&parent_keys, &n2->key);
		bch_btree_node_write(n2, &cl);
		mutex_unlock(&n2->write_lock);
		rw_unlock(true, n2);
	} else {
		trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);

		mutex_lock(&n1->write_lock);
		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
	}

	bch_keylist_add(&parent_keys, &n1->key);
	bch_btree_node_write(n1, &cl);
	mutex_unlock(&n1->write_lock);

	if (n3) {
		/* Depth increases, make a new root */
		mutex_lock(&n3->write_lock);
		bkey_copy_key(&n3->key, &MAX_KEY);
		bch_btree_insert_keys(n3, op, &parent_keys, NULL);
		bch_btree_node_write(n3, &cl);
		mutex_unlock(&n3->write_lock);

		closure_sync(&cl);
		bch_btree_set_root(n3);
		rw_unlock(true, n3);
	} else if (!b->parent) {
		/* Root filled up but didn't need to be split */
		closure_sync(&cl);
		bch_btree_set_root(n1);
	} else {
		/* Split a non root node */
		closure_sync(&cl);
		make_btree_freeing_key(b, parent_keys.top);
		bch_keylist_push(&parent_keys);

		bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
		BUG_ON(!bch_keylist_empty(&parent_keys));
	}

	btree_node_free(b);
	rw_unlock(true, n1);

	bch_time_stats_update(&b->c->btree_split_time, start_time);

	return 0;
err_free2:
	bkey_put(b->c, &n2->key);
	btree_node_free(n2);
	rw_unlock(true, n2);
err_free1:
	bkey_put(b->c, &n1->key);
	btree_node_free(n1);
	rw_unlock(true, n1);
err:
	WARN(1, "bcache: btree split failed (level %u)", b->level);

	if (n3 == ERR_PTR(-EAGAIN) ||
	    n2 == ERR_PTR(-EAGAIN) ||
	    n1 == ERR_PTR(-EAGAIN))
		return -EAGAIN;

	return -ENOMEM;
}
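
/*
 * Insert keys into a single node, splitting it if they don't fit.  On
 * the split path, -EAGAIN is returned when called from the bio
 * submission path (current->bio_list set) and -EINTR when the traversal
 * didn't take locks high enough up the tree; both make the caller retry
 * from the root.
 */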

static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
				 struct keylist *insert_keys,
				 atomic_t *journal_ref,
				 struct bkey *replace_key)
{
	struct closure cl;

	BUG_ON(b->level && replace_key);

	closure_init_stack(&cl);

	mutex_lock(&b->write_lock);

	if (write_block(b) != btree_bset_last(b) &&
	    b->keys.last_set_unwritten)
		bch_btree_init_next(b); /* just wrote a set */

	if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
		mutex_unlock(&b->write_lock);
		goto split;
	}

	BUG_ON(write_block(b) != btree_bset_last(b));

	if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
		if (!b->level)
			bch_btree_leaf_dirty(b, journal_ref);
		else
			bch_btree_node_write(b, &cl);
	}

	mutex_unlock(&b->write_lock);

	/* wait for btree node write if necessary, after unlock */
	closure_sync(&cl);

	return 0;
split:
	if (current->bio_list) {
		op->lock = b->c->root->level + 1;
		return -EAGAIN;
	} else if (op->lock <= b->c->root->level) {
		op->lock = b->c->root->level + 1;
		return -EINTR;
	} else {
		/* Invalidated all iterators */
		int ret = btree_split(b, op, insert_keys, replace_key);

		if (bch_keylist_empty(insert_keys))
			return 0;
		else if (!ret)
			return -EINTR;
		return ret;
	}
}

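/*
 * Insert a zero-size placeholder key (random payload, PTR_CHECK_DEV
 * pointer) on behalf of a cache miss, so that a racing write to the
 * same range later shows up as a replace-key collision.  May have to
 * upgrade the node lock from read to write, re-checking b->seq to
 * detect that the node changed underneath us.
 */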
int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
			       struct bkey *check_key)
{
	int ret = -EINTR;
	uint64_t btree_ptr = b->key.ptr[0];
	unsigned long seq = b->seq;
	struct keylist insert;
	bool upgrade = op->lock == -1;

	bch_keylist_init(&insert);

	if (upgrade) {
		rw_unlock(false, b);
		rw_lock(true, b, b->level);

		if (b->key.ptr[0] != btree_ptr ||
		    b->seq != seq + 1) {
			op->lock = b->level;
			goto out;
		}
	}

	SET_KEY_PTRS(check_key, 1);
	get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));

	SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);

	bch_keylist_add(&insert, check_key);

	ret = bch_btree_insert_node(b, op, &insert, NULL, NULL);

	BUG_ON(!ret && !bch_keylist_empty(&insert));
out:
	if (upgrade)
		downgrade_write(&b->lock);
	return ret;
}

struct btree_insert_op {
	struct btree_op	op;
	struct keylist	*keys;
	atomic_t	*journal_ref;
	struct bkey	*replace_key;
};

static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
{
	struct btree_insert_op *op = container_of(b_op,
					struct btree_insert_op, op);

	int ret = bch_btree_insert_node(b, &op->op, op->keys,
					op->journal_ref, op->replace_key);
	if (ret && !bch_keylist_empty(op->keys))
		return ret;
	else
		return MAP_DONE;
}

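/*
 * Top-level insert: repeatedly map the keylist onto the leaf nodes it
 * spans until every key has been inserted.  Usage sketch (illustrative,
 * not taken from a real caller):
 *
 *	struct keylist keys;
 *
 *	bch_keylist_init(&keys);
 *	bch_keylist_add(&keys, &k.key);
 *	ret = bch_btree_insert(c, &keys, NULL, NULL);
 */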
int bch_btree_insert(struct cache_set *c, struct keylist *keys,
		     atomic_t *journal_ref, struct bkey *replace_key)
{
	struct btree_insert_op op;
	int ret = 0;

	BUG_ON(current->bio_list);
	BUG_ON(bch_keylist_empty(keys));

	bch_btree_op_init(&op.op, 0);
	op.keys		= keys;
	op.journal_ref	= journal_ref;
	op.replace_key	= replace_key;

	while (!ret && !bch_keylist_empty(keys)) {
		op.op.lock = 0;
		ret = bch_btree_map_leaf_nodes(&op.op, c,
					       &START_KEY(keys->keys),
					       btree_insert_fn);
	}

	if (ret) {
		struct bkey *k;

		pr_err("error %i", ret);

		while ((k = bch_keylist_pop(keys)))
			bkey_put(c, k);
	} else if (op.op.insert_collision)
		ret = -ESRCH;

	return ret;
}

void bch_btree_set_root(struct btree *b)
{
	unsigned int i;
	struct closure cl;

	closure_init_stack(&cl);

	trace_bcache_btree_set_root(b);

	BUG_ON(!b->written);

	for (i = 0; i < KEY_PTRS(&b->key); i++)
		BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);

	mutex_lock(&b->c->bucket_lock);
	list_del_init(&b->list);
	mutex_unlock(&b->c->bucket_lock);

	b->c->root = b;

	bch_journal_meta(b->c, &cl);
	closure_sync(&cl);
}

/* Map across nodes or keys */
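
/*
 * The callbacks return MAP_CONTINUE to keep walking; MAP_DONE or a
 * negative error stops the walk and is propagated back to the caller.
 */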

static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
				       struct bkey *from,
				       btree_map_nodes_fn *fn, int flags)
{
	int ret = MAP_CONTINUE;

	if (b->level) {
		struct bkey *k;
		struct btree_iter iter;

		bch_btree_iter_init(&b->keys, &iter, from);

		while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
						       bch_ptr_bad))) {
			ret = btree(map_nodes_recurse, k, b,
				    op, from, fn, flags);
			from = NULL;

			if (ret != MAP_CONTINUE)
				return ret;
		}
	}

	if (!b->level || flags == MAP_ALL_NODES)
		ret = fn(op, b);

	return ret;
}

int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
			  struct bkey *from, btree_map_nodes_fn *fn, int flags)
{
	return btree_root(map_nodes_recurse, c, op, from, fn, flags);
}

static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
				      struct bkey *from, btree_map_keys_fn *fn,
				      int flags)
{
	int ret = MAP_CONTINUE;
	struct bkey *k;
	struct btree_iter iter;

	bch_btree_iter_init(&b->keys, &iter, from);

	while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
		ret = !b->level
			? fn(op, b, k)
			: btree(map_keys_recurse, k, b, op, from, fn, flags);
		from = NULL;

		if (ret != MAP_CONTINUE)
			return ret;
	}

	if (!b->level && (flags & MAP_END_KEY))
		ret = fn(op, b, &KEY(KEY_INODE(&b->key),
				     KEY_OFFSET(&b->key), 0));

	return ret;
}

int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
		       struct bkey *from, btree_map_keys_fn *fn, int flags)
{
	return btree_root(map_keys_recurse, c, op, from, fn, flags);
}

/* Keybuf code */
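
/*
 * A keybuf buffers candidate keys (for writeback and moving GC) in an
 * rbtree, refilled by scanning the btree with a caller-supplied
 * predicate.  Overlapping keys compare equal here so that lookups find
 * any buffered key overlapping a given range.
 */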

static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
{
	/* Overlapping keys compare equal */
	if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
		return -1;
	if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
		return 1;
	return 0;
}

static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
					    struct keybuf_key *r)
{
	return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
}

struct refill {
	struct btree_op	op;
	unsigned int	nr_found;
	struct keybuf	*buf;
	struct bkey	*end;
	keybuf_pred_fn	*pred;
};

static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
			    struct bkey *k)
{
	struct refill *refill = container_of(op, struct refill, op);
	struct keybuf *buf = refill->buf;
	int ret = MAP_CONTINUE;

	if (bkey_cmp(k, refill->end) >= 0) {
		ret = MAP_DONE;
		goto out;
	}

	if (!KEY_SIZE(k)) /* end key */
		goto out;

	if (refill->pred(buf, k)) {
		struct keybuf_key *w;

		spin_lock(&buf->lock);

		w = array_alloc(&buf->freelist);
		if (!w) {
			spin_unlock(&buf->lock);
			return MAP_DONE;
		}

		w->private = NULL;
		bkey_copy(&w->key, k);

		if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
			array_free(&buf->freelist, w);
		else
			refill->nr_found++;

		if (array_freelist_empty(&buf->freelist))
			ret = MAP_DONE;

		spin_unlock(&buf->lock);
	}
out:
	buf->last_scanned = *k;
	return ret;
}

void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
		       struct bkey *end, keybuf_pred_fn *pred)
{
	struct bkey start = buf->last_scanned;
	struct refill refill;

	cond_resched();

	bch_btree_op_init(&refill.op, -1);
	refill.nr_found	= 0;
	refill.buf	= buf;
	refill.end	= end;
	refill.pred	= pred;

	bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
			   refill_keybuf_fn, MAP_END_KEY);

	trace_bcache_keyscan(refill.nr_found,
			     KEY_INODE(&start), KEY_OFFSET(&start),
			     KEY_INODE(&buf->last_scanned),
			     KEY_OFFSET(&buf->last_scanned));

	spin_lock(&buf->lock);

	if (!RB_EMPTY_ROOT(&buf->keys)) {
		struct keybuf_key *w;

		w = RB_FIRST(&buf->keys, struct keybuf_key, node);
		buf->start	= START_KEY(&w->key);

		w = RB_LAST(&buf->keys, struct keybuf_key, node);
		buf->end	= w->key;
	} else {
		buf->start	= MAX_KEY;
		buf->end	= MAX_KEY;
	}

	spin_unlock(&buf->lock);
}

static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
	rb_erase(&w->node, &buf->keys);
	array_free(&buf->freelist, w);
}

void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
	spin_lock(&buf->lock);
	__bch_keybuf_del(buf, w);
	spin_unlock(&buf->lock);
}

bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
				  struct bkey *end)
{
	bool ret = false;
	struct keybuf_key *p, *w, s;

	s.key = *start;

	if (bkey_cmp(end, &buf->start) <= 0 ||
	    bkey_cmp(start, &buf->end) >= 0)
		return false;

	spin_lock(&buf->lock);
	w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);

	while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
		p = w;
		w = RB_NEXT(w, node);

		if (p->private)
			ret = true;
		else
			__bch_keybuf_del(buf, p);
	}

	spin_unlock(&buf->lock);
	return ret;
}

struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
{
	struct keybuf_key *w;

	spin_lock(&buf->lock);

	w = RB_FIRST(&buf->keys, struct keybuf_key, node);

	while (w && w->private)
		w = RB_NEXT(w, node);

	if (w)
		w->private = ERR_PTR(-EINTR);

	spin_unlock(&buf->lock);
	return w;
}

struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
					  struct keybuf *buf,
					  struct bkey *end,
					  keybuf_pred_fn *pred)
{
	struct keybuf_key *ret;

	while (1) {
		ret = bch_keybuf_next(buf);
		if (ret)
			break;

		if (bkey_cmp(&buf->last_scanned, end) >= 0) {
			pr_debug("scan finished");
			break;
		}

		bch_refill_keybuf(c, buf, end, pred);
	}

	return ret;
}

void bch_keybuf_init(struct keybuf *buf)
{
	buf->last_scanned	= MAX_KEY;
	buf->keys		= RB_ROOT;

	spin_lock_init(&buf->lock);
	array_allocator_init(&buf->freelist);
}