btree.c 58.8 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
K
Kent Overstreet 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Uses a block device as cache for other block devices; optimized for SSDs.
 * All allocation is done in buckets, which should match the erase block size
 * of the device.
 *
 * Buckets containing cached data are kept on a heap sorted by priority;
 * bucket priority is increased on cache hit, and periodically all the buckets
 * on the heap have their priority scaled down. This currently is just used as
 * an LRU but in the future should allow for more intelligent heuristics.
 *
 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
 * counter. Garbage collection is used to remove stale pointers.
 *
 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
 * as keys are inserted we only sort the pages that have not yet been written.
 * When garbage collection is run, we resort the entire node.
 *
21
 * All configuration is done via sysfs; see Documentation/admin-guide/bcache.rst.
K
Kent Overstreet 已提交
22 23 24 25 26
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
27
#include "extents.h"
K
Kent Overstreet 已提交
28 29 30 31

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/hash.h>
K
Kent Overstreet 已提交
32
#include <linux/kthread.h>
33
#include <linux/prefetch.h>
K
Kent Overstreet 已提交
34 35
#include <linux/random.h>
#include <linux/rcupdate.h>
36
#include <linux/sched/clock.h>
37 38
#include <linux/rculist.h>

K
Kent Overstreet 已提交
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
#include <trace/events/bcache.h>

/*
 * Todo:
 * register_bcache: Return errors out to userspace correctly
 *
 * Writeback: don't undirty key until after a cache flush
 *
 * Create an iterator for key pointers
 *
 * On btree write error, mark bucket such that it won't be freed from the cache
 *
 * Journalling:
 *   Check for bad keys in replay
 *   Propagate barriers
 *   Refcount journal entries in journal_replay
 *
 * Garbage collection:
 *   Finish incremental gc
 *   Gc should free old UUIDs, data for invalid UUIDs
 *
 * Provide a way to list backing device UUIDs we have data cached for, and
 * probably how long it's been since we've seen them, and a way to invalidate
 * dirty data for devices that will never be attached again
 *
 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
 * that based on that and how much dirty data we have we can keep writeback
 * from being starved
 *
 * Add a tracepoint or somesuch to watch for writeback starvation
 *
 * When btree depth > 1 and splitting an interior node, we have to make sure
 * alloc_bucket() cannot fail. This should be true but is not completely
 * obvious.
 *
 * Plugging?
 *
 * If data write is less than hard sector size of ssd, round up offset in open
 * bucket to the next whole sector
 *
 * Superblock needs to be fleshed out for multiple cache devices
 *
 * Add a sysfs tunable for the number of writeback IOs in flight
 *
 * Add a sysfs tunable for the number of open data buckets
 *
 * IO tracking: Can we track when one process is doing io on behalf of another?
 * IO tracking: Don't use just an average, weigh more recent stuff higher
 *
 * Test module load/unload
 */

#define MAX_NEED_GC		64
#define MAX_SAVE_PRIO		72
93
#define MAX_GC_TIMES		100
T
Tang Junhui 已提交
94 95
#define MIN_GC_NODES		100
#define GC_SLEEP_MS		100
K
Kent Overstreet 已提交
96 97 98 99 100 101

#define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))

#define PTR_HASH(c, k)							\
	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))

102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
#define insert_lock(s, b)	((b)->level <= (s)->lock)

/*
 * These macros are for recursing down the btree - they handle the details of
 * locking and looking up nodes in the cache for you. They're best treated as
 * mere syntax when reading code that uses them.
 *
 * op->lock determines whether we take a read or a write lock at a given depth.
 * If you've got a read lock and find that you need a write lock (i.e. you're
 * going to have to split), set op->lock and return -EINTR; btree_root() will
 * call you again and you'll have the correct lock.
 */

/**
 * btree - recurse down the btree on a specified key
 * @fn:		function to call, which will be passed the child node
 * @key:	key to recurse on
 * @b:		parent btree node
 * @op:		pointer to struct btree_op
 */
#define btree(fn, key, b, op, ...)					\
({									\
	int _r, l = (b)->level - 1;					\
	bool _w = l <= (op)->lock;					\
126 127
	struct btree *_child = bch_btree_node_get((b)->c, op, key, l,	\
						  _w, b);		\
128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
	if (!IS_ERR(_child)) {						\
		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
		rw_unlock(_w, _child);					\
	} else								\
		_r = PTR_ERR(_child);					\
	_r;								\
})

/**
 * btree_root - call a function on the root of the btree
 * @fn:		function to call, which will be passed the child node
 * @c:		cache set
 * @op:		pointer to struct btree_op
 */
#define btree_root(fn, c, op, ...)					\
({									\
	int _r = -EINTR;						\
	do {								\
		struct btree *_b = (c)->root;				\
		bool _w = insert_lock(op, _b);				\
		rw_lock(_w, _b, _b->level);				\
		if (_b == (c)->root &&					\
		    _w == insert_lock(op, _b)) {			\
			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
		}							\
		rw_unlock(_w, _b);					\
154
		bch_cannibalize_unlock(c);				\
155 156
		if (_r == -EINTR)					\
			schedule();					\
157 158
	} while (_r == -EINTR);						\
									\
159
	finish_wait(&(c)->btree_cache_wait, &(op)->wait);		\
160 161 162
	_r;								\
})

K
Kent Overstreet 已提交
163 164 165 166 167
static inline struct bset *write_block(struct btree *b)
{
	return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
}

K
Kent Overstreet 已提交
168 169 170 171 172 173 174 175 176 177 178 179 180 181
static void bch_btree_init_next(struct btree *b)
{
	/* If not a leaf node, always sort */
	if (b->level && b->keys.nsets)
		bch_btree_sort(&b->keys, &b->c->sort);
	else
		bch_btree_sort_lazy(&b->keys, &b->c->sort);

	if (b->written < btree_blocks(b))
		bch_bset_init_next(&b->keys, write_block(b),
				   bset_magic(&b->c->sb));

}

K
Kent Overstreet 已提交
182 183
/* Btree key manipulation */

184
void bkey_put(struct cache_set *c, struct bkey *k)
185
{
186
	unsigned int i;
187 188 189 190 191 192

	for (i = 0; i < KEY_PTRS(k); i++)
		if (ptr_available(c, k, i))
			atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
}

K
Kent Overstreet 已提交
193 194 195 196 197
/* Btree IO */

static uint64_t btree_csum_set(struct btree *b, struct bset *i)
{
	uint64_t crc = b->key.ptr[0];
K
Kent Overstreet 已提交
198
	void *data = (void *) i + 8, *end = bset_bkey_last(i);
K
Kent Overstreet 已提交
199

200
	crc = bch_crc64_update(crc, data, end - data);
K
Kent Overstreet 已提交
201
	return crc ^ 0xffffffffffffffffULL;
K
Kent Overstreet 已提交
202 203
}

204
void bch_btree_node_read_done(struct btree *b)
K
Kent Overstreet 已提交
205 206
{
	const char *err = "bad btree header";
207
	struct bset *i = btree_bset_first(b);
K
Kent Overstreet 已提交
208
	struct btree_iter *iter;
K
Kent Overstreet 已提交
209

210
	iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
K
Kent Overstreet 已提交
211
	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
K
Kent Overstreet 已提交
212 213
	iter->used = 0;

K
Kent Overstreet 已提交
214
#ifdef CONFIG_BCACHE_DEBUG
215
	iter->b = &b->keys;
K
Kent Overstreet 已提交
216 217
#endif

K
Kent Overstreet 已提交
218
	if (!i->seq)
K
Kent Overstreet 已提交
219 220 221
		goto err;

	for (;
K
Kent Overstreet 已提交
222
	     b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
K
Kent Overstreet 已提交
223 224 225 226 227 228
	     i = write_block(b)) {
		err = "unsupported bset version";
		if (i->version > BCACHE_BSET_VERSION)
			goto err;

		err = "bad btree header";
229 230
		if (b->written + set_blocks(i, block_bytes(b->c)) >
		    btree_blocks(b))
K
Kent Overstreet 已提交
231 232 233
			goto err;

		err = "bad magic";
234
		if (i->magic != bset_magic(&b->c->sb))
K
Kent Overstreet 已提交
235 236 237 238 239 240 241 242 243 244 245 246 247 248 249
			goto err;

		err = "bad checksum";
		switch (i->version) {
		case 0:
			if (i->csum != csum_set(i))
				goto err;
			break;
		case BCACHE_BSET_VERSION:
			if (i->csum != btree_csum_set(b, i))
				goto err;
			break;
		}

		err = "empty set";
K
Kent Overstreet 已提交
250
		if (i != b->keys.set[0].data && !i->keys)
K
Kent Overstreet 已提交
251 252
			goto err;

K
Kent Overstreet 已提交
253
		bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
K
Kent Overstreet 已提交
254

255
		b->written += set_blocks(i, block_bytes(b->c));
K
Kent Overstreet 已提交
256 257 258 259
	}

	err = "corrupted btree";
	for (i = write_block(b);
K
Kent Overstreet 已提交
260
	     bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
K
Kent Overstreet 已提交
261
	     i = ((void *) i) + block_bytes(b->c))
K
Kent Overstreet 已提交
262
		if (i->seq == b->keys.set[0].data->seq)
K
Kent Overstreet 已提交
263 264
			goto err;

K
Kent Overstreet 已提交
265
	bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);
K
Kent Overstreet 已提交
266

K
Kent Overstreet 已提交
267
	i = b->keys.set[0].data;
K
Kent Overstreet 已提交
268
	err = "short btree key";
K
Kent Overstreet 已提交
269 270
	if (b->keys.set[0].size &&
	    bkey_cmp(&b->key, &b->keys.set[0].end) < 0)
K
Kent Overstreet 已提交
271 272 273
		goto err;

	if (b->written < btree_blocks(b))
K
Kent Overstreet 已提交
274 275
		bch_bset_init_next(&b->keys, write_block(b),
				   bset_magic(&b->c->sb));
K
Kent Overstreet 已提交
276
out:
277
	mempool_free(iter, &b->c->fill_iter);
K
Kent Overstreet 已提交
278
	return;
K
Kent Overstreet 已提交
279 280
err:
	set_btree_node_io_error(b);
K
Kent Overstreet 已提交
281
	bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
K
Kent Overstreet 已提交
282
			    err, PTR_BUCKET_NR(b->c, &b->key, 0),
K
Kent Overstreet 已提交
283
			    bset_block_offset(b, i), i->keys);
K
Kent Overstreet 已提交
284 285 286
	goto out;
}

287
static void btree_node_read_endio(struct bio *bio)
K
Kent Overstreet 已提交
288
{
K
Kent Overstreet 已提交
289
	struct closure *cl = bio->bi_private;
290

K
Kent Overstreet 已提交
291 292
	closure_put(cl);
}
K
Kent Overstreet 已提交
293

294
static void bch_btree_node_read(struct btree *b)
K
Kent Overstreet 已提交
295 296 297 298
{
	uint64_t start_time = local_clock();
	struct closure cl;
	struct bio *bio;
K
Kent Overstreet 已提交
299

K
Kent Overstreet 已提交
300 301
	trace_bcache_btree_read(b);

K
Kent Overstreet 已提交
302
	closure_init_stack(&cl);
K
Kent Overstreet 已提交
303

K
Kent Overstreet 已提交
304
	bio = bch_bbio_alloc(b->c);
305
	bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9;
K
Kent Overstreet 已提交
306 307
	bio->bi_end_io	= btree_node_read_endio;
	bio->bi_private	= &cl;
308
	bio->bi_opf = REQ_OP_READ | REQ_META;
K
Kent Overstreet 已提交
309

K
Kent Overstreet 已提交
310
	bch_bio_map(bio, b->keys.set[0].data);
K
Kent Overstreet 已提交
311

K
Kent Overstreet 已提交
312 313
	bch_submit_bbio(bio, b->c, &b->key, 0);
	closure_sync(&cl);
K
Kent Overstreet 已提交
314

315
	if (bio->bi_status)
K
Kent Overstreet 已提交
316 317 318 319 320 321 322 323 324 325 326 327
		set_btree_node_io_error(b);

	bch_bbio_free(bio, b->c);

	if (btree_node_io_error(b))
		goto err;

	bch_btree_node_read_done(b);
	bch_time_stats_update(&b->c->btree_read_time, start_time);

	return;
err:
328
	bch_cache_set_error(b->c, "io error reading bucket %zu",
K
Kent Overstreet 已提交
329
			    PTR_BUCKET_NR(b->c, &b->key, 0));
K
Kent Overstreet 已提交
330 331 332 333 334 335
}

static void btree_complete_write(struct btree *b, struct btree_write *w)
{
	if (w->prio_blocked &&
	    !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
336
		wake_up_allocators(b->c);
K
Kent Overstreet 已提交
337 338 339 340 341 342 343 344 345 346

	if (w->journal) {
		atomic_dec_bug(w->journal);
		__closure_wake_up(&b->c->journal.wait);
	}

	w->prio_blocked	= 0;
	w->journal	= NULL;
}

347 348 349 350 351 352 353
static void btree_node_write_unlock(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io);

	up(&b->io_mutex);
}

K
Kent Overstreet 已提交
354
static void __btree_node_write_done(struct closure *cl)
K
Kent Overstreet 已提交
355
{
356
	struct btree *b = container_of(cl, struct btree, io);
K
Kent Overstreet 已提交
357 358 359 360 361 362 363
	struct btree_write *w = btree_prev_write(b);

	bch_bbio_free(b->bio, b->c);
	b->bio = NULL;
	btree_complete_write(b, w);

	if (btree_node_dirty(b))
K
Kent Overstreet 已提交
364
		schedule_delayed_work(&b->work, 30 * HZ);
K
Kent Overstreet 已提交
365

366
	closure_return_with_destructor(cl, btree_node_write_unlock);
K
Kent Overstreet 已提交
367 368
}

K
Kent Overstreet 已提交
369
static void btree_node_write_done(struct closure *cl)
K
Kent Overstreet 已提交
370
{
371
	struct btree *b = container_of(cl, struct btree, io);
K
Kent Overstreet 已提交
372

373
	bio_free_pages(b->bio);
K
Kent Overstreet 已提交
374
	__btree_node_write_done(cl);
K
Kent Overstreet 已提交
375 376
}

377
static void btree_node_write_endio(struct bio *bio)
K
Kent Overstreet 已提交
378 379
{
	struct closure *cl = bio->bi_private;
380
	struct btree *b = container_of(cl, struct btree, io);
K
Kent Overstreet 已提交
381

382
	if (bio->bi_status)
K
Kent Overstreet 已提交
383 384
		set_btree_node_io_error(b);

385
	bch_bbio_count_io_errors(b->c, bio, bio->bi_status, "writing btree");
K
Kent Overstreet 已提交
386 387 388 389
	closure_put(cl);
}

static void do_btree_node_write(struct btree *b)
K
Kent Overstreet 已提交
390
{
391
	struct closure *cl = &b->io;
392
	struct bset *i = btree_bset_last(b);
K
Kent Overstreet 已提交
393 394 395 396 397
	BKEY_PADDED(key) k;

	i->version	= BCACHE_BSET_VERSION;
	i->csum		= btree_csum_set(b, i);

K
Kent Overstreet 已提交
398 399 400 401
	BUG_ON(b->bio);
	b->bio = bch_bbio_alloc(b->c);

	b->bio->bi_end_io	= btree_node_write_endio;
K
Kent Overstreet 已提交
402
	b->bio->bi_private	= cl;
403
	b->bio->bi_iter.bi_size	= roundup(set_bytes(i), block_bytes(b->c));
404
	b->bio->bi_opf		= REQ_OP_WRITE | REQ_META | REQ_FUA;
405
	bch_bio_map(b->bio, i);
K
Kent Overstreet 已提交
406

K
Kent Overstreet 已提交
407 408 409 410 411 412 413 414 415 416 417 418 419 420 421
	/*
	 * If we're appending to a leaf node, we don't technically need FUA -
	 * this write just needs to be persisted before the next journal write,
	 * which will be marked FLUSH|FUA.
	 *
	 * Similarly if we're writing a new btree root - the pointer is going to
	 * be in the next journal entry.
	 *
	 * But if we're writing a new btree node (that isn't a root) or
	 * appending to a non leaf btree node, we need either FUA or a flush
	 * when we write the parent with the new pointer. FUA is cheaper than a
	 * flush, and writes appending to leaf nodes aren't blocking anything so
	 * just make all btree node writes FUA to keep things sane.
	 */

K
Kent Overstreet 已提交
422
	bkey_copy(&k.key, &b->key);
423
	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
K
Kent Overstreet 已提交
424
		       bset_sector_offset(&b->keys, i));
K
Kent Overstreet 已提交
425

426
	if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
K
Kent Overstreet 已提交
427 428 429 430
		int j;
		struct bio_vec *bv;
		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));

431
		bio_for_each_segment_all(bv, b->bio, j)
K
Kent Overstreet 已提交
432 433 434 435 436
			memcpy(page_address(bv->bv_page),
			       base + j * PAGE_SIZE, PAGE_SIZE);

		bch_submit_bbio(b->bio, b->c, &k.key, 0);

K
Kent Overstreet 已提交
437
		continue_at(cl, btree_node_write_done, NULL);
K
Kent Overstreet 已提交
438
	} else {
439
		/* No problem for multipage bvec since the bio is just allocated */
K
Kent Overstreet 已提交
440
		b->bio->bi_vcnt = 0;
441
		bch_bio_map(b->bio, i);
K
Kent Overstreet 已提交
442 443 444 445

		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		closure_sync(cl);
446
		continue_at_nobarrier(cl, __btree_node_write_done, NULL);
K
Kent Overstreet 已提交
447 448 449
	}
}

K
Kent Overstreet 已提交
450
void __bch_btree_node_write(struct btree *b, struct closure *parent)
K
Kent Overstreet 已提交
451
{
452
	struct bset *i = btree_bset_last(b);
K
Kent Overstreet 已提交
453

K
Kent Overstreet 已提交
454 455
	lockdep_assert_held(&b->write_lock);

K
Kent Overstreet 已提交
456 457
	trace_bcache_btree_write(b);

K
Kent Overstreet 已提交
458
	BUG_ON(current->bio_list);
K
Kent Overstreet 已提交
459 460
	BUG_ON(b->written >= btree_blocks(b));
	BUG_ON(b->written && !i->keys);
461
	BUG_ON(btree_bset_first(b)->seq != i->seq);
462
	bch_check_keys(&b->keys, "writing");
K
Kent Overstreet 已提交
463 464 465

	cancel_delayed_work(&b->work);

K
Kent Overstreet 已提交
466
	/* If caller isn't waiting for write, parent refcount is cache set */
467 468
	down(&b->io_mutex);
	closure_init(&b->io, parent ?: &b->c->cl);
K
Kent Overstreet 已提交
469

K
Kent Overstreet 已提交
470 471 472
	clear_bit(BTREE_NODE_dirty,	 &b->flags);
	change_bit(BTREE_NODE_write_idx, &b->flags);

K
Kent Overstreet 已提交
473
	do_btree_node_write(b);
K
Kent Overstreet 已提交
474

475
	atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
K
Kent Overstreet 已提交
476 477
			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);

K
Kent Overstreet 已提交
478
	b->written += set_blocks(i, block_bytes(b->c));
K
Kent Overstreet 已提交
479
}
K
Kent Overstreet 已提交
480

K
Kent Overstreet 已提交
481 482
void bch_btree_node_write(struct btree *b, struct closure *parent)
{
483
	unsigned int nsets = b->keys.nsets;
K
Kent Overstreet 已提交
484 485 486 487

	lockdep_assert_held(&b->lock);

	__bch_btree_node_write(b, parent);
K
Kent Overstreet 已提交
488

489 490 491 492
	/*
	 * do verify if there was more than one set initially (i.e. we did a
	 * sort) and we sorted down to a single set:
	 */
K
Kent Overstreet 已提交
493
	if (nsets && !b->keys.nsets)
494 495
		bch_btree_verify(b);

K
Kent Overstreet 已提交
496
	bch_btree_init_next(b);
K
Kent Overstreet 已提交
497 498
}

499 500 501 502 503
static void bch_btree_node_write_sync(struct btree *b)
{
	struct closure cl;

	closure_init_stack(&cl);
K
Kent Overstreet 已提交
504 505

	mutex_lock(&b->write_lock);
506
	bch_btree_node_write(b, &cl);
K
Kent Overstreet 已提交
507 508
	mutex_unlock(&b->write_lock);

509 510 511
	closure_sync(&cl);
}

K
Kent Overstreet 已提交
512
static void btree_node_write_work(struct work_struct *w)
K
Kent Overstreet 已提交
513 514 515
{
	struct btree *b = container_of(to_delayed_work(w), struct btree, work);

K
Kent Overstreet 已提交
516
	mutex_lock(&b->write_lock);
K
Kent Overstreet 已提交
517
	if (btree_node_dirty(b))
K
Kent Overstreet 已提交
518 519
		__bch_btree_node_write(b, NULL);
	mutex_unlock(&b->write_lock);
K
Kent Overstreet 已提交
520 521
}

K
Kent Overstreet 已提交
522
static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
K
Kent Overstreet 已提交
523
{
524
	struct bset *i = btree_bset_last(b);
K
Kent Overstreet 已提交
525 526
	struct btree_write *w = btree_current_write(b);

K
Kent Overstreet 已提交
527 528
	lockdep_assert_held(&b->write_lock);

K
Kent Overstreet 已提交
529 530
	BUG_ON(!b->written);
	BUG_ON(!i->keys);
K
Kent Overstreet 已提交
531

K
Kent Overstreet 已提交
532
	if (!btree_node_dirty(b))
K
Kent Overstreet 已提交
533
		schedule_delayed_work(&b->work, 30 * HZ);
K
Kent Overstreet 已提交
534

K
Kent Overstreet 已提交
535
	set_btree_node_dirty(b);
K
Kent Overstreet 已提交
536

K
Kent Overstreet 已提交
537
	if (journal_ref) {
K
Kent Overstreet 已提交
538
		if (w->journal &&
K
Kent Overstreet 已提交
539
		    journal_pin_cmp(b->c, w->journal, journal_ref)) {
K
Kent Overstreet 已提交
540 541 542 543 544
			atomic_dec_bug(w->journal);
			w->journal = NULL;
		}

		if (!w->journal) {
K
Kent Overstreet 已提交
545
			w->journal = journal_ref;
K
Kent Overstreet 已提交
546 547 548 549 550
			atomic_inc(w->journal);
		}
	}

	/* Force write if set is too big */
K
Kent Overstreet 已提交
551 552 553
	if (set_bytes(i) > PAGE_SIZE - 48 &&
	    !current->bio_list)
		bch_btree_node_write(b, NULL);
K
Kent Overstreet 已提交
554 555 556 557 558 559 560 561 562 563
}

/*
 * Btree in memory cache - allocation/freeing
 * mca -> memory cache
 */

#define mca_reserve(c)	(((c->root && c->root->level)		\
			  ? c->root->level : 1) * 8 + 16)
#define mca_can_free(c)						\
564
	max_t(int, 0, c->btree_cache_used - mca_reserve(c))
K
Kent Overstreet 已提交
565 566 567

static void mca_data_free(struct btree *b)
{
568
	BUG_ON(b->io_mutex.count != 1);
K
Kent Overstreet 已提交
569

K
Kent Overstreet 已提交
570
	bch_btree_keys_free(&b->keys);
K
Kent Overstreet 已提交
571

572
	b->c->btree_cache_used--;
573
	list_move(&b->list, &b->c->btree_cache_freed);
K
Kent Overstreet 已提交
574 575 576 577 578 579 580 581 582 583 584
}

static void mca_bucket_free(struct btree *b)
{
	BUG_ON(btree_node_dirty(b));

	b->key.ptr[0] = 0;
	hlist_del_init_rcu(&b->hash);
	list_move(&b->list, &b->c->btree_cache_freeable);
}

585
static unsigned int btree_order(struct bkey *k)
K
Kent Overstreet 已提交
586 587 588 589 590 591
{
	return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
}

static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
{
K
Kent Overstreet 已提交
592
	if (!bch_btree_keys_alloc(&b->keys,
593
				  max_t(unsigned int,
594 595 596
					ilog2(b->c->btree_pages),
					btree_order(k)),
				  gfp)) {
597
		b->c->btree_cache_used++;
598 599 600 601
		list_move(&b->list, &b->c->btree_cache);
	} else {
		list_move(&b->list, &b->c->btree_cache_freed);
	}
K
Kent Overstreet 已提交
602 603 604 605 606 607
}

static struct btree *mca_bucket_alloc(struct cache_set *c,
				      struct bkey *k, gfp_t gfp)
{
	struct btree *b = kzalloc(sizeof(struct btree), gfp);
608

K
Kent Overstreet 已提交
609 610 611 612 613
	if (!b)
		return NULL;

	init_rwsem(&b->lock);
	lockdep_set_novalidate_class(&b->lock);
K
Kent Overstreet 已提交
614 615
	mutex_init(&b->write_lock);
	lockdep_set_novalidate_class(&b->write_lock);
K
Kent Overstreet 已提交
616
	INIT_LIST_HEAD(&b->list);
K
Kent Overstreet 已提交
617
	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
K
Kent Overstreet 已提交
618
	b->c = c;
619
	sema_init(&b->io_mutex, 1);
K
Kent Overstreet 已提交
620 621 622 623 624

	mca_data_alloc(b, k, gfp);
	return b;
}

625
static int mca_reap(struct btree *b, unsigned int min_order, bool flush)
K
Kent Overstreet 已提交
626
{
627 628 629
	struct closure cl;

	closure_init_stack(&cl);
K
Kent Overstreet 已提交
630 631 632 633 634
	lockdep_assert_held(&b->c->bucket_lock);

	if (!down_write_trylock(&b->lock))
		return -ENOMEM;

K
Kent Overstreet 已提交
635
	BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data);
636

K
Kent Overstreet 已提交
637
	if (b->keys.page_order < min_order)
638 639 640 641 642 643 644 645 646
		goto out_unlock;

	if (!flush) {
		if (btree_node_dirty(b))
			goto out_unlock;

		if (down_trylock(&b->io_mutex))
			goto out_unlock;
		up(&b->io_mutex);
K
Kent Overstreet 已提交
647 648
	}

K
Kent Overstreet 已提交
649
	mutex_lock(&b->write_lock);
650
	if (btree_node_dirty(b))
K
Kent Overstreet 已提交
651 652 653 654
		__bch_btree_node_write(b, &cl);
	mutex_unlock(&b->write_lock);

	closure_sync(&cl);
K
Kent Overstreet 已提交
655

656
	/* wait for any in flight btree write */
657 658
	down(&b->io_mutex);
	up(&b->io_mutex);
659

K
Kent Overstreet 已提交
660
	return 0;
661 662 663
out_unlock:
	rw_unlock(true, b);
	return -ENOMEM;
K
Kent Overstreet 已提交
664 665
}

666 667
static unsigned long bch_mca_scan(struct shrinker *shrink,
				  struct shrink_control *sc)
K
Kent Overstreet 已提交
668 669 670 671
{
	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
	struct btree *b, *t;
	unsigned long i, nr = sc->nr_to_scan;
672
	unsigned long freed = 0;
673
	unsigned int btree_cache_used;
K
Kent Overstreet 已提交
674 675

	if (c->shrinker_disabled)
676
		return SHRINK_STOP;
K
Kent Overstreet 已提交
677

678
	if (c->btree_cache_alloc_lock)
679
		return SHRINK_STOP;
K
Kent Overstreet 已提交
680 681

	/* Return -1 if we can't do anything right now */
682
	if (sc->gfp_mask & __GFP_IO)
K
Kent Overstreet 已提交
683 684 685 686
		mutex_lock(&c->bucket_lock);
	else if (!mutex_trylock(&c->bucket_lock))
		return -1;

687 688 689 690 691 692 693
	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
K
Kent Overstreet 已提交
694 695 696 697
	nr /= c->btree_pages;
	nr = min_t(unsigned long, nr, mca_can_free(c));

	i = 0;
698
	btree_cache_used = c->btree_cache_used;
K
Kent Overstreet 已提交
699
	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
700 701
		if (nr <= 0)
			goto out;
K
Kent Overstreet 已提交
702 703

		if (++i > 3 &&
704
		    !mca_reap(b, 0, false)) {
K
Kent Overstreet 已提交
705 706
			mca_data_free(b);
			rw_unlock(true, b);
707
			freed++;
K
Kent Overstreet 已提交
708
		}
709
		nr--;
K
Kent Overstreet 已提交
710 711
	}

712
	for (;  (nr--) && i < btree_cache_used; i++) {
K
Kent Overstreet 已提交
713 714 715
		if (list_empty(&c->btree_cache))
			goto out;

K
Kent Overstreet 已提交
716 717 718 719
		b = list_first_entry(&c->btree_cache, struct btree, list);
		list_rotate_left(&c->btree_cache);

		if (!b->accessed &&
720
		    !mca_reap(b, 0, false)) {
K
Kent Overstreet 已提交
721 722 723
			mca_bucket_free(b);
			mca_data_free(b);
			rw_unlock(true, b);
724
			freed++;
K
Kent Overstreet 已提交
725 726 727 728 729
		} else
			b->accessed = 0;
	}
out:
	mutex_unlock(&c->bucket_lock);
730
	return freed * c->btree_pages;
731 732 733 734 735 736 737 738 739 740
}

static unsigned long bch_mca_count(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	struct cache_set *c = container_of(shrink, struct cache_set, shrink);

	if (c->shrinker_disabled)
		return 0;

741
	if (c->btree_cache_alloc_lock)
742 743 744
		return 0;

	return mca_can_free(c) * c->btree_pages;
K
Kent Overstreet 已提交
745 746 747 748 749 750
}

void bch_btree_cache_free(struct cache_set *c)
{
	struct btree *b;
	struct closure cl;
751

K
Kent Overstreet 已提交
752 753 754 755 756 757 758 759 760 761
	closure_init_stack(&cl);

	if (c->shrink.list.next)
		unregister_shrinker(&c->shrink);

	mutex_lock(&c->bucket_lock);

#ifdef CONFIG_BCACHE_DEBUG
	if (c->verify_data)
		list_move(&c->verify_data->list, &c->btree_cache);
762 763

	free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
K
Kent Overstreet 已提交
764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791
#endif

	list_splice(&c->btree_cache_freeable,
		    &c->btree_cache);

	while (!list_empty(&c->btree_cache)) {
		b = list_first_entry(&c->btree_cache, struct btree, list);

		if (btree_node_dirty(b))
			btree_complete_write(b, btree_current_write(b));
		clear_bit(BTREE_NODE_dirty, &b->flags);

		mca_data_free(b);
	}

	while (!list_empty(&c->btree_cache_freed)) {
		b = list_first_entry(&c->btree_cache_freed,
				     struct btree, list);
		list_del(&b->list);
		cancel_delayed_work_sync(&b->work);
		kfree(b);
	}

	mutex_unlock(&c->bucket_lock);
}

int bch_btree_cache_alloc(struct cache_set *c)
{
792
	unsigned int i;
K
Kent Overstreet 已提交
793 794

	for (i = 0; i < mca_reserve(c); i++)
K
Kent Overstreet 已提交
795 796
		if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
			return -ENOMEM;
K
Kent Overstreet 已提交
797 798 799 800 801 802 803

	list_splice_init(&c->btree_cache,
			 &c->btree_cache_freeable);

#ifdef CONFIG_BCACHE_DEBUG
	mutex_init(&c->verify_lock);

804 805 806
	c->verify_ondisk = (void *)
		__get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));

K
Kent Overstreet 已提交
807 808 809
	c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);

	if (c->verify_data &&
K
Kent Overstreet 已提交
810
	    c->verify_data->keys.set->data)
K
Kent Overstreet 已提交
811 812 813 814 815
		list_del_init(&c->verify_data->list);
	else
		c->verify_data = NULL;
#endif

816 817
	c->shrink.count_objects = bch_mca_count;
	c->shrink.scan_objects = bch_mca_scan;
K
Kent Overstreet 已提交
818 819
	c->shrink.seeks = 4;
	c->shrink.batch = c->btree_pages * 2;
820 821 822 823

	if (register_shrinker(&c->shrink))
		pr_warn("bcache: %s: could not register shrinker",
				__func__);
K
Kent Overstreet 已提交
824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848

	return 0;
}

/* Btree in memory cache - hash table */

static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
{
	return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
}

static struct btree *mca_find(struct cache_set *c, struct bkey *k)
{
	struct btree *b;

	rcu_read_lock();
	hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
		if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
			goto out;
	b = NULL;
out:
	rcu_read_unlock();
	return b;
}

849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865
static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
{
	struct task_struct *old;

	old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
	if (old && old != current) {
		if (op)
			prepare_to_wait(&c->btree_cache_wait, &op->wait,
					TASK_UNINTERRUPTIBLE);
		return -EINTR;
	}

	return 0;
}

static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
				     struct bkey *k)
K
Kent Overstreet 已提交
866
{
867
	struct btree *b;
K
Kent Overstreet 已提交
868

K
Kent Overstreet 已提交
869 870
	trace_bcache_btree_cache_cannibalize(c);

871 872
	if (mca_cannibalize_lock(c, op))
		return ERR_PTR(-EINTR);
K
Kent Overstreet 已提交
873

874 875 876
	list_for_each_entry_reverse(b, &c->btree_cache, list)
		if (!mca_reap(b, btree_order(k), false))
			return b;
K
Kent Overstreet 已提交
877

878 879 880
	list_for_each_entry_reverse(b, &c->btree_cache, list)
		if (!mca_reap(b, btree_order(k), true))
			return b;
K
Kent Overstreet 已提交
881

882
	WARN(1, "btree cache cannibalize failed\n");
883
	return ERR_PTR(-ENOMEM);
K
Kent Overstreet 已提交
884 885 886 887 888 889 890 891
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
892
static void bch_cannibalize_unlock(struct cache_set *c)
K
Kent Overstreet 已提交
893
{
894 895 896
	if (c->btree_cache_alloc_lock == current) {
		c->btree_cache_alloc_lock = NULL;
		wake_up(&c->btree_cache_wait);
K
Kent Overstreet 已提交
897 898 899
	}
}

900 901
static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
			       struct bkey *k, int level)
K
Kent Overstreet 已提交
902 903 904
{
	struct btree *b;

905 906
	BUG_ON(current->bio_list);

K
Kent Overstreet 已提交
907 908 909 910 911 912 913 914 915
	lockdep_assert_held(&c->bucket_lock);

	if (mca_find(c, k))
		return NULL;

	/* btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b, &c->btree_cache_freeable, list)
916
		if (!mca_reap(b, btree_order(k), false))
K
Kent Overstreet 已提交
917 918 919 920 921 922
			goto out;

	/* We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, &c->btree_cache_freed, list)
923
		if (!mca_reap(b, 0, false)) {
K
Kent Overstreet 已提交
924
			mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
K
Kent Overstreet 已提交
925
			if (!b->keys.set[0].data)
K
Kent Overstreet 已提交
926 927 928 929 930 931 932 933 934 935
				goto err;
			else
				goto out;
		}

	b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
	if (!b)
		goto err;

	BUG_ON(!down_write_trylock(&b->lock));
K
Kent Overstreet 已提交
936
	if (!b->keys.set->data)
K
Kent Overstreet 已提交
937 938
		goto err;
out:
939
	BUG_ON(b->io_mutex.count != 1);
K
Kent Overstreet 已提交
940 941 942 943 944 945 946

	bkey_copy(&b->key, k);
	list_move(&b->list, &c->btree_cache);
	hlist_del_init_rcu(&b->hash);
	hlist_add_head_rcu(&b->hash, mca_hash(c, k));

	lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
947
	b->parent	= (void *) ~0UL;
K
Kent Overstreet 已提交
948 949 950
	b->flags	= 0;
	b->written	= 0;
	b->level	= level;
K
Kent Overstreet 已提交
951

952
	if (!b->level)
K
Kent Overstreet 已提交
953 954
		bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
				    &b->c->expensive_debug_checks);
955
	else
K
Kent Overstreet 已提交
956 957
		bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
				    &b->c->expensive_debug_checks);
K
Kent Overstreet 已提交
958 959 960 961 962 963

	return b;
err:
	if (b)
		rw_unlock(true, b);

964
	b = mca_cannibalize(c, op, k);
K
Kent Overstreet 已提交
965 966 967 968 969 970
	if (!IS_ERR(b))
		goto out;

	return b;
}

971
/*
K
Kent Overstreet 已提交
972 973 974
 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
K
Kent Overstreet 已提交
975
 * If IO is necessary and running under generic_make_request, returns -EAGAIN.
K
Kent Overstreet 已提交
976 977 978 979
 *
 * The btree node will have either a read or a write lock held, depending on
 * level and op->lock.
 */
980
struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
981 982
				 struct bkey *k, int level, bool write,
				 struct btree *parent)
K
Kent Overstreet 已提交
983 984 985 986 987 988 989 990 991
{
	int i = 0;
	struct btree *b;

	BUG_ON(level < 0);
retry:
	b = mca_find(c, k);

	if (!b) {
K
Kent Overstreet 已提交
992 993 994
		if (current->bio_list)
			return ERR_PTR(-EAGAIN);

K
Kent Overstreet 已提交
995
		mutex_lock(&c->bucket_lock);
996
		b = mca_alloc(c, op, k, level);
K
Kent Overstreet 已提交
997 998 999 1000 1001 1002 1003
		mutex_unlock(&c->bucket_lock);

		if (!b)
			goto retry;
		if (IS_ERR(b))
			return b;

K
Kent Overstreet 已提交
1004
		bch_btree_node_read(b);
K
Kent Overstreet 已提交
1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016

		if (!write)
			downgrade_write(&b->lock);
	} else {
		rw_lock(write, b, level);
		if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
			rw_unlock(write, b);
			goto retry;
		}
		BUG_ON(b->level != level);
	}

1017 1018 1019 1020 1021 1022 1023
	if (btree_node_io_error(b)) {
		rw_unlock(write, b);
		return ERR_PTR(-EIO);
	}

	BUG_ON(!b->written);

1024
	b->parent = parent;
K
Kent Overstreet 已提交
1025 1026
	b->accessed = 1;

K
Kent Overstreet 已提交
1027 1028 1029
	for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
		prefetch(b->keys.set[i].tree);
		prefetch(b->keys.set[i].data);
K
Kent Overstreet 已提交
1030 1031
	}

K
Kent Overstreet 已提交
1032 1033
	for (; i <= b->keys.nsets; i++)
		prefetch(b->keys.set[i].data);
K
Kent Overstreet 已提交
1034 1035 1036 1037

	return b;
}

1038
static void btree_node_prefetch(struct btree *parent, struct bkey *k)
K
Kent Overstreet 已提交
1039 1040 1041
{
	struct btree *b;

1042 1043 1044
	mutex_lock(&parent->c->bucket_lock);
	b = mca_alloc(parent->c, NULL, k, parent->level - 1);
	mutex_unlock(&parent->c->bucket_lock);
K
Kent Overstreet 已提交
1045 1046

	if (!IS_ERR_OR_NULL(b)) {
1047
		b->parent = parent;
K
Kent Overstreet 已提交
1048
		bch_btree_node_read(b);
K
Kent Overstreet 已提交
1049 1050 1051 1052 1053 1054
		rw_unlock(true, b);
	}
}

/* Btree alloc */

1055
static void btree_node_free(struct btree *b)
K
Kent Overstreet 已提交
1056
{
K
Kent Overstreet 已提交
1057 1058
	trace_bcache_btree_node_free(b);

K
Kent Overstreet 已提交
1059 1060
	BUG_ON(b == b->c->root);

K
Kent Overstreet 已提交
1061 1062
	mutex_lock(&b->write_lock);

K
Kent Overstreet 已提交
1063 1064 1065 1066
	if (btree_node_dirty(b))
		btree_complete_write(b, btree_current_write(b));
	clear_bit(BTREE_NODE_dirty, &b->flags);

K
Kent Overstreet 已提交
1067 1068
	mutex_unlock(&b->write_lock);

K
Kent Overstreet 已提交
1069 1070 1071 1072 1073 1074 1075 1076
	cancel_delayed_work(&b->work);

	mutex_lock(&b->c->bucket_lock);
	bch_bucket_free(b->c, &b->key);
	mca_bucket_free(b);
	mutex_unlock(&b->c->bucket_lock);
}

1077
struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
1078 1079
				     int level, bool wait,
				     struct btree *parent)
K
Kent Overstreet 已提交
1080 1081 1082 1083 1084 1085
{
	BKEY_PADDED(key) k;
	struct btree *b = ERR_PTR(-EAGAIN);

	mutex_lock(&c->bucket_lock);
retry:
1086
	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
K
Kent Overstreet 已提交
1087 1088
		goto err;

1089
	bkey_put(c, &k.key);
K
Kent Overstreet 已提交
1090 1091
	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);

1092
	b = mca_alloc(c, op, &k.key, level);
K
Kent Overstreet 已提交
1093 1094 1095 1096
	if (IS_ERR(b))
		goto err_free;

	if (!b) {
K
Kent Overstreet 已提交
1097 1098
		cache_bug(c,
			"Tried to allocate bucket that was in btree cache");
K
Kent Overstreet 已提交
1099 1100 1101 1102
		goto retry;
	}

	b->accessed = 1;
1103
	b->parent = parent;
K
Kent Overstreet 已提交
1104
	bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
K
Kent Overstreet 已提交
1105 1106

	mutex_unlock(&c->bucket_lock);
K
Kent Overstreet 已提交
1107 1108

	trace_bcache_btree_node_alloc(b);
K
Kent Overstreet 已提交
1109 1110 1111 1112 1113
	return b;
err_free:
	bch_bucket_free(c, &k.key);
err:
	mutex_unlock(&c->bucket_lock);
K
Kent Overstreet 已提交
1114

1115
	trace_bcache_btree_node_alloc_fail(c);
K
Kent Overstreet 已提交
1116 1117 1118
	return b;
}

1119
static struct btree *bch_btree_node_alloc(struct cache_set *c,
1120 1121
					  struct btree_op *op, int level,
					  struct btree *parent)
1122
{
1123
	return __bch_btree_node_alloc(c, op, level, op != NULL, parent);
1124 1125
}

1126 1127
static struct btree *btree_node_alloc_replacement(struct btree *b,
						  struct btree_op *op)
K
Kent Overstreet 已提交
1128
{
1129
	struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);
1130

1131
	if (!IS_ERR_OR_NULL(n)) {
K
Kent Overstreet 已提交
1132
		mutex_lock(&n->write_lock);
1133
		bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
1134
		bkey_copy_key(&n->key, &b->key);
K
Kent Overstreet 已提交
1135
		mutex_unlock(&n->write_lock);
1136
	}
K
Kent Overstreet 已提交
1137 1138 1139 1140

	return n;
}

1141 1142
static void make_btree_freeing_key(struct btree *b, struct bkey *k)
{
1143
	unsigned int i;
1144

1145 1146 1147 1148
	mutex_lock(&b->c->bucket_lock);

	atomic_inc(&b->c->prio_blocked);

1149 1150 1151
	bkey_copy(k, &b->key);
	bkey_copy_key(k, &ZERO_KEY);

1152 1153 1154 1155
	for (i = 0; i < KEY_PTRS(k); i++)
		SET_PTR_GEN(k, i,
			    bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
					PTR_BUCKET(b->c, &b->key, i)));
1156

1157
	mutex_unlock(&b->c->bucket_lock);
1158 1159
}

1160 1161 1162 1163
static int btree_check_reserve(struct btree *b, struct btree_op *op)
{
	struct cache_set *c = b->c;
	struct cache *ca;
1164
	unsigned int i, reserve = (c->root->level - b->level) * 2 + 1;
1165 1166 1167 1168 1169 1170

	mutex_lock(&c->bucket_lock);

	for_each_cache(ca, c, i)
		if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
			if (op)
1171
				prepare_to_wait(&c->btree_cache_wait, &op->wait,
1172
						TASK_UNINTERRUPTIBLE);
1173 1174
			mutex_unlock(&c->bucket_lock);
			return -EINTR;
1175 1176 1177
		}

	mutex_unlock(&c->bucket_lock);
1178 1179

	return mca_cannibalize_lock(b->c, op);
1180 1181
}

K
Kent Overstreet 已提交
1182 1183
/* Garbage collection */

1184 1185
static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
				    struct bkey *k)
K
Kent Overstreet 已提交
1186 1187
{
	uint8_t stale = 0;
1188
	unsigned int i;
K
Kent Overstreet 已提交
1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204
	struct bucket *g;

	/*
	 * ptr_invalid() can't return true for the keys that mark btree nodes as
	 * freed, but since ptr_bad() returns true we'll never actually use them
	 * for anything and thus we don't want mark their pointers here
	 */
	if (!bkey_cmp(k, &ZERO_KEY))
		return stale;

	for (i = 0; i < KEY_PTRS(k); i++) {
		if (!ptr_available(c, k, i))
			continue;

		g = PTR_BUCKET(c, k, i);

K
Kent Overstreet 已提交
1205 1206
		if (gen_after(g->last_gc, PTR_GEN(k, i)))
			g->last_gc = PTR_GEN(k, i);
K
Kent Overstreet 已提交
1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221

		if (ptr_stale(c, k, i)) {
			stale = max(stale, ptr_stale(c, k, i));
			continue;
		}

		cache_bug_on(GC_MARK(g) &&
			     (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
			     c, "inconsistent ptrs: mark = %llu, level = %i",
			     GC_MARK(g), level);

		if (level)
			SET_GC_MARK(g, GC_MARK_METADATA);
		else if (KEY_DIRTY(k))
			SET_GC_MARK(g, GC_MARK_DIRTY);
1222 1223
		else if (!GC_MARK(g))
			SET_GC_MARK(g, GC_MARK_RECLAIMABLE);
K
Kent Overstreet 已提交
1224 1225

		/* guard against overflow */
1226
		SET_GC_SECTORS_USED(g, min_t(unsigned int,
K
Kent Overstreet 已提交
1227
					     GC_SECTORS_USED(g) + KEY_SIZE(k),
1228
					     MAX_GC_SECTORS_USED));
K
Kent Overstreet 已提交
1229 1230 1231 1232 1233 1234 1235 1236 1237

		BUG_ON(!GC_SECTORS_USED(g));
	}

	return stale;
}

#define btree_mark_key(b, k)	__bch_btree_mark_key(b->c, b->level, k)

1238 1239
void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
{
1240
	unsigned int i;
1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257

	for (i = 0; i < KEY_PTRS(k); i++)
		if (ptr_available(c, k, i) &&
		    !ptr_stale(c, k, i)) {
			struct bucket *b = PTR_BUCKET(c, k, i);

			b->gen = PTR_GEN(k, i);

			if (level && bkey_cmp(k, &ZERO_KEY))
				b->prio = BTREE_PRIO;
			else if (!level && b->prio == BTREE_PRIO)
				b->prio = INITIAL_PRIO;
		}

	__bch_btree_mark_key(c, level, k);
}

1258 1259 1260 1261 1262
void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats)
{
	stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets;
}

K
Kent Overstreet 已提交
1263
static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
K
Kent Overstreet 已提交
1264 1265
{
	uint8_t stale = 0;
1266
	unsigned int keys = 0, good_keys = 0;
K
Kent Overstreet 已提交
1267 1268 1269 1270 1271 1272
	struct bkey *k;
	struct btree_iter iter;
	struct bset_tree *t;

	gc->nodes++;

1273
	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
K
Kent Overstreet 已提交
1274
		stale = max(stale, btree_mark_key(b, k));
K
Kent Overstreet 已提交
1275
		keys++;
K
Kent Overstreet 已提交
1276

K
Kent Overstreet 已提交
1277
		if (bch_ptr_bad(&b->keys, k))
K
Kent Overstreet 已提交
1278 1279 1280 1281
			continue;

		gc->key_bytes += bkey_u64s(k);
		gc->nkeys++;
K
Kent Overstreet 已提交
1282
		good_keys++;
K
Kent Overstreet 已提交
1283 1284 1285 1286

		gc->data += KEY_SIZE(k);
	}

K
Kent Overstreet 已提交
1287
	for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
K
Kent Overstreet 已提交
1288
		btree_bug_on(t->size &&
K
Kent Overstreet 已提交
1289
			     bset_written(&b->keys, t) &&
K
Kent Overstreet 已提交
1290 1291 1292
			     bkey_cmp(&b->key, &t->end) < 0,
			     b, "found short btree key in gc");

K
Kent Overstreet 已提交
1293 1294
	if (b->c->gc_always_rewrite)
		return true;
K
Kent Overstreet 已提交
1295

K
Kent Overstreet 已提交
1296 1297
	if (stale > 10)
		return true;
K
Kent Overstreet 已提交
1298

K
Kent Overstreet 已提交
1299 1300
	if ((keys - good_keys) * 2 > keys)
		return true;
K
Kent Overstreet 已提交
1301

K
Kent Overstreet 已提交
1302
	return false;
K
Kent Overstreet 已提交
1303 1304
}

K
Kent Overstreet 已提交
1305
#define GC_MERGE_NODES	4U
K
Kent Overstreet 已提交
1306 1307 1308

struct gc_merge_info {
	struct btree	*b;
1309
	unsigned int	keys;
K
Kent Overstreet 已提交
1310 1311
};

K
Kent Overstreet 已提交
1312 1313 1314 1315
static int bch_btree_insert_node(struct btree *, struct btree_op *,
				 struct keylist *, atomic_t *, struct bkey *);

static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1316
			     struct gc_stat *gc, struct gc_merge_info *r)
K
Kent Overstreet 已提交
1317
{
1318
	unsigned int i, nodes = 0, keys = 0, blocks;
K
Kent Overstreet 已提交
1319
	struct btree *new_nodes[GC_MERGE_NODES];
1320
	struct keylist keylist;
K
Kent Overstreet 已提交
1321
	struct closure cl;
K
Kent Overstreet 已提交
1322
	struct bkey *k;
K
Kent Overstreet 已提交
1323

1324 1325 1326 1327 1328
	bch_keylist_init(&keylist);

	if (btree_check_reserve(b, NULL))
		return 0;

K
Kent Overstreet 已提交
1329
	memset(new_nodes, 0, sizeof(new_nodes));
K
Kent Overstreet 已提交
1330
	closure_init_stack(&cl);
K
Kent Overstreet 已提交
1331

K
Kent Overstreet 已提交
1332
	while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
K
Kent Overstreet 已提交
1333 1334 1335 1336 1337
		keys += r[nodes++].keys;

	blocks = btree_default_blocks(b->c) * 2 / 3;

	if (nodes < 2 ||
K
Kent Overstreet 已提交
1338
	    __set_blocks(b->keys.set[0].data, keys,
1339
			 block_bytes(b->c)) > blocks * (nodes - 1))
K
Kent Overstreet 已提交
1340
		return 0;
K
Kent Overstreet 已提交
1341

K
Kent Overstreet 已提交
1342
	for (i = 0; i < nodes; i++) {
1343
		new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
K
Kent Overstreet 已提交
1344 1345
		if (IS_ERR_OR_NULL(new_nodes[i]))
			goto out_nocoalesce;
K
Kent Overstreet 已提交
1346 1347
	}

1348 1349 1350 1351 1352 1353 1354 1355 1356
	/*
	 * We have to check the reserve here, after we've allocated our new
	 * nodes, to make sure the insert below will succeed - we also check
	 * before as an optimization to potentially avoid a bunch of expensive
	 * allocs/sorts
	 */
	if (btree_check_reserve(b, NULL))
		goto out_nocoalesce;

K
Kent Overstreet 已提交
1357 1358 1359
	for (i = 0; i < nodes; i++)
		mutex_lock(&new_nodes[i]->write_lock);

K
Kent Overstreet 已提交
1360
	for (i = nodes - 1; i > 0; --i) {
1361 1362
		struct bset *n1 = btree_bset_first(new_nodes[i]);
		struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
K
Kent Overstreet 已提交
1363 1364 1365 1366
		struct bkey *k, *last = NULL;

		keys = 0;

K
Kent Overstreet 已提交
1367 1368
		if (i > 1) {
			for (k = n2->start;
K
Kent Overstreet 已提交
1369
			     k < bset_bkey_last(n2);
K
Kent Overstreet 已提交
1370 1371
			     k = bkey_next(k)) {
				if (__set_blocks(n1, n1->keys + keys +
1372 1373
						 bkey_u64s(k),
						 block_bytes(b->c)) > blocks)
K
Kent Overstreet 已提交
1374 1375 1376 1377 1378 1379
					break;

				last = k;
				keys += bkey_u64s(k);
			}
		} else {
K
Kent Overstreet 已提交
1380 1381 1382 1383 1384 1385 1386 1387
			/*
			 * Last node we're not getting rid of - we're getting
			 * rid of the node at r[0]. Have to try and fit all of
			 * the remaining keys into this node; we can't ensure
			 * they will always fit due to rounding and variable
			 * length keys (shouldn't be possible in practice,
			 * though)
			 */
K
Kent Overstreet 已提交
1388
			if (__set_blocks(n1, n1->keys + n2->keys,
1389 1390
					 block_bytes(b->c)) >
			    btree_blocks(new_nodes[i]))
K
Kent Overstreet 已提交
1391
				goto out_nocoalesce;
K
Kent Overstreet 已提交
1392 1393

			keys = n2->keys;
K
Kent Overstreet 已提交
1394
			/* Take the key of the node we're getting rid of */
K
Kent Overstreet 已提交
1395
			last = &r->b->key;
K
Kent Overstreet 已提交
1396
		}
K
Kent Overstreet 已提交
1397

1398 1399
		BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
		       btree_blocks(new_nodes[i]));
K
Kent Overstreet 已提交
1400

K
Kent Overstreet 已提交
1401 1402
		if (last)
			bkey_copy_key(&new_nodes[i]->key, last);
K
Kent Overstreet 已提交
1403

K
Kent Overstreet 已提交
1404
		memcpy(bset_bkey_last(n1),
K
Kent Overstreet 已提交
1405
		       n2->start,
K
Kent Overstreet 已提交
1406
		       (void *) bset_bkey_idx(n2, keys) - (void *) n2->start);
K
Kent Overstreet 已提交
1407 1408

		n1->keys += keys;
K
Kent Overstreet 已提交
1409
		r[i].keys = n1->keys;
K
Kent Overstreet 已提交
1410 1411

		memmove(n2->start,
K
Kent Overstreet 已提交
1412 1413 1414
			bset_bkey_idx(n2, keys),
			(void *) bset_bkey_last(n2) -
			(void *) bset_bkey_idx(n2, keys));
K
Kent Overstreet 已提交
1415 1416 1417

		n2->keys -= keys;

1418
		if (__bch_keylist_realloc(&keylist,
1419
					  bkey_u64s(&new_nodes[i]->key)))
K
Kent Overstreet 已提交
1420 1421 1422
			goto out_nocoalesce;

		bch_btree_node_write(new_nodes[i], &cl);
1423
		bch_keylist_add(&keylist, &new_nodes[i]->key);
K
Kent Overstreet 已提交
1424 1425
	}

K
Kent Overstreet 已提交
1426 1427 1428
	for (i = 0; i < nodes; i++)
		mutex_unlock(&new_nodes[i]->write_lock);

1429 1430 1431 1432 1433 1434
	closure_sync(&cl);

	/* We emptied out this node */
	BUG_ON(btree_bset_first(new_nodes[0])->keys);
	btree_node_free(new_nodes[0]);
	rw_unlock(true, new_nodes[0]);
1435
	new_nodes[0] = NULL;
1436

K
Kent Overstreet 已提交
1437
	for (i = 0; i < nodes; i++) {
1438
		if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
K
Kent Overstreet 已提交
1439
			goto out_nocoalesce;
K
Kent Overstreet 已提交
1440

1441 1442
		make_btree_freeing_key(r[i].b, keylist.top);
		bch_keylist_push(&keylist);
K
Kent Overstreet 已提交
1443
	}
K
Kent Overstreet 已提交
1444

1445 1446
	bch_btree_insert_node(b, op, &keylist, NULL, NULL);
	BUG_ON(!bch_keylist_empty(&keylist));
K
Kent Overstreet 已提交
1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458

	for (i = 0; i < nodes; i++) {
		btree_node_free(r[i].b);
		rw_unlock(true, r[i].b);

		r[i].b = new_nodes[i];
	}

	memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
	r[nodes - 1].b = ERR_PTR(-EINTR);

	trace_bcache_btree_gc_coalesce(nodes);
K
Kent Overstreet 已提交
1459 1460
	gc->nodes--;

1461 1462
	bch_keylist_free(&keylist);

K
Kent Overstreet 已提交
1463 1464 1465 1466 1467
	/* Invalidated our iterator */
	return -EINTR;

out_nocoalesce:
	closure_sync(&cl);
1468
	bch_keylist_free(&keylist);
K
Kent Overstreet 已提交
1469

1470
	while ((k = bch_keylist_pop(&keylist)))
K
Kent Overstreet 已提交
1471 1472 1473 1474 1475 1476 1477 1478 1479
		if (!bkey_cmp(k, &ZERO_KEY))
			atomic_dec(&b->c->prio_blocked);

	for (i = 0; i < nodes; i++)
		if (!IS_ERR_OR_NULL(new_nodes[i])) {
			btree_node_free(new_nodes[i]);
			rw_unlock(true, new_nodes[i]);
		}
	return 0;
K
Kent Overstreet 已提交
1480 1481
}

1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517
static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
				 struct btree *replace)
{
	struct keylist keys;
	struct btree *n;

	if (btree_check_reserve(b, NULL))
		return 0;

	n = btree_node_alloc_replacement(replace, NULL);

	/* recheck reserve after allocating replacement node */
	if (btree_check_reserve(b, NULL)) {
		btree_node_free(n);
		rw_unlock(true, n);
		return 0;
	}

	bch_btree_node_write_sync(n);

	bch_keylist_init(&keys);
	bch_keylist_add(&keys, &n->key);

	make_btree_freeing_key(replace, keys.top);
	bch_keylist_push(&keys);

	bch_btree_insert_node(b, op, &keys, NULL, NULL);
	BUG_ON(!bch_keylist_empty(&keys));

	btree_node_free(replace);
	rw_unlock(true, n);

	/* Invalidated our iterator */
	return -EINTR;
}

1518
static unsigned int btree_gc_count_keys(struct btree *b)
K
Kent Overstreet 已提交
1519
{
K
Kent Overstreet 已提交
1520 1521
	struct bkey *k;
	struct btree_iter iter;
1522
	unsigned int ret = 0;
K
Kent Overstreet 已提交
1523

1524
	for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
K
Kent Overstreet 已提交
1525 1526 1527 1528
		ret += bkey_u64s(k);

	return ret;
}
K
Kent Overstreet 已提交
1529

1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555
static size_t btree_gc_min_nodes(struct cache_set *c)
{
	size_t min_nodes;

	/*
	 * Since incremental GC would stop 100ms when front
	 * side I/O comes, so when there are many btree nodes,
	 * if GC only processes constant (100) nodes each time,
	 * GC would last a long time, and the front side I/Os
	 * would run out of the buckets (since no new bucket
	 * can be allocated during GC), and be blocked again.
	 * So GC should not process constant nodes, but varied
	 * nodes according to the number of btree nodes, which
	 * realized by dividing GC into constant(100) times,
	 * so when there are many btree nodes, GC can process
	 * more nodes each time, otherwise, GC will process less
	 * nodes each time (but no less than MIN_GC_NODES)
	 */
	min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
	if (min_nodes < MIN_GC_NODES)
		min_nodes = MIN_GC_NODES;

	return min_nodes;
}


K
Kent Overstreet 已提交
1556 1557 1558 1559 1560 1561 1562
static int btree_gc_recurse(struct btree *b, struct btree_op *op,
			    struct closure *writes, struct gc_stat *gc)
{
	int ret = 0;
	bool should_rewrite;
	struct bkey *k;
	struct btree_iter iter;
K
Kent Overstreet 已提交
1563
	struct gc_merge_info r[GC_MERGE_NODES];
K
Kent Overstreet 已提交
1564
	struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
K
Kent Overstreet 已提交
1565

1566
	bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
K
Kent Overstreet 已提交
1567

K
Kent Overstreet 已提交
1568 1569
	for (i = r; i < r + ARRAY_SIZE(r); i++)
		i->b = ERR_PTR(-EINTR);
K
Kent Overstreet 已提交
1570

K
Kent Overstreet 已提交
1571
	while (1) {
K
Kent Overstreet 已提交
1572
		k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
K
Kent Overstreet 已提交
1573
		if (k) {
1574
			r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
1575
						  true, b);
K
Kent Overstreet 已提交
1576 1577 1578 1579 1580 1581 1582
			if (IS_ERR(r->b)) {
				ret = PTR_ERR(r->b);
				break;
			}

			r->keys = btree_gc_count_keys(r->b);

1583
			ret = btree_gc_coalesce(b, op, gc, r);
K
Kent Overstreet 已提交
1584 1585
			if (ret)
				break;
K
Kent Overstreet 已提交
1586 1587
		}

K
Kent Overstreet 已提交
1588 1589
		if (!last->b)
			break;
K
Kent Overstreet 已提交
1590

K
Kent Overstreet 已提交
1591 1592
		if (!IS_ERR(last->b)) {
			should_rewrite = btree_gc_mark_node(last->b, gc);
1593 1594 1595
			if (should_rewrite) {
				ret = btree_gc_rewrite_node(b, op, last->b);
				if (ret)
K
Kent Overstreet 已提交
1596 1597 1598 1599 1600 1601 1602 1603
					break;
			}

			if (last->b->level) {
				ret = btree_gc_recurse(last->b, op, writes, gc);
				if (ret)
					break;
			}
K
Kent Overstreet 已提交
1604

K
Kent Overstreet 已提交
1605 1606 1607 1608 1609 1610
			bkey_copy_key(&b->c->gc_done, &last->b->key);

			/*
			 * Must flush leaf nodes before gc ends, since replace
			 * operations aren't journalled
			 */
K
Kent Overstreet 已提交
1611
			mutex_lock(&last->b->write_lock);
K
Kent Overstreet 已提交
1612 1613
			if (btree_node_dirty(last->b))
				bch_btree_node_write(last->b, writes);
K
Kent Overstreet 已提交
1614
			mutex_unlock(&last->b->write_lock);
K
Kent Overstreet 已提交
1615 1616 1617 1618 1619
			rw_unlock(true, last->b);
		}

		memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
		r->b = NULL;
K
Kent Overstreet 已提交
1620

T
Tang Junhui 已提交
1621
		if (atomic_read(&b->c->search_inflight) &&
1622
		    gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
T
Tang Junhui 已提交
1623 1624 1625 1626 1627
			gc->nodes_pre =  gc->nodes;
			ret = -EAGAIN;
			break;
		}

K
Kent Overstreet 已提交
1628 1629 1630 1631 1632 1633
		if (need_resched()) {
			ret = -EAGAIN;
			break;
		}
	}

K
Kent Overstreet 已提交
1634 1635 1636 1637 1638 1639 1640
	for (i = r; i < r + ARRAY_SIZE(r); i++)
		if (!IS_ERR_OR_NULL(i->b)) {
			mutex_lock(&i->b->write_lock);
			if (btree_node_dirty(i->b))
				bch_btree_node_write(i->b, writes);
			mutex_unlock(&i->b->write_lock);
			rw_unlock(true, i->b);
K
Kent Overstreet 已提交
1641
		}
K
Kent Overstreet 已提交
1642 1643 1644 1645 1646 1647 1648 1649

	return ret;
}

static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
			     struct closure *writes, struct gc_stat *gc)
{
	struct btree *n = NULL;
K
Kent Overstreet 已提交
1650 1651
	int ret = 0;
	bool should_rewrite;
K
Kent Overstreet 已提交
1652

K
Kent Overstreet 已提交
1653 1654
	should_rewrite = btree_gc_mark_node(b, gc);
	if (should_rewrite) {
1655
		n = btree_node_alloc_replacement(b, NULL);
K
Kent Overstreet 已提交
1656

K
Kent Overstreet 已提交
1657 1658
		if (!IS_ERR_OR_NULL(n)) {
			bch_btree_node_write_sync(n);
K
Kent Overstreet 已提交
1659

K
Kent Overstreet 已提交
1660 1661 1662
			bch_btree_set_root(n);
			btree_node_free(b);
			rw_unlock(true, n);
K
Kent Overstreet 已提交
1663

K
Kent Overstreet 已提交
1664 1665 1666
			return -EINTR;
		}
	}
K
Kent Overstreet 已提交
1667

1668 1669
	__bch_btree_mark_key(b->c, b->level + 1, &b->key);

K
Kent Overstreet 已提交
1670 1671 1672 1673
	if (b->level) {
		ret = btree_gc_recurse(b, op, writes, gc);
		if (ret)
			return ret;
K
Kent Overstreet 已提交
1674 1675
	}

K
Kent Overstreet 已提交
1676 1677
	bkey_copy_key(&b->c->gc_done, &b->key);

K
Kent Overstreet 已提交
1678 1679 1680 1681 1682 1683 1684
	return ret;
}

static void btree_gc_start(struct cache_set *c)
{
	struct cache *ca;
	struct bucket *b;
	unsigned int i;

	if (!c->gc_mark_valid)
		return;

	mutex_lock(&c->bucket_lock);

	c->gc_mark_valid = 0;
	c->gc_done = ZERO_KEY;

	for_each_cache(ca, c, i)
		for_each_bucket(b, ca) {
			b->last_gc = b->gen;
			if (!atomic_read(&b->pin)) {
				SET_GC_MARK(b, 0);
				SET_GC_SECTORS_USED(b, 0);
			}
		}

	mutex_unlock(&c->bucket_lock);
}

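/*
 * Finish a GC pass: re-mark the metadata buckets (the uuid bucket, the
 * prio buckets and the buckets listed in ca->sb.d), mark buckets that
 * dirty writeback keys still point to, and recount avail_nbuckets.
 */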
static void bch_btree_gc_finish(struct cache_set *c)
{
	struct bucket *b;
	struct cache *ca;
	unsigned int i;

	mutex_lock(&c->bucket_lock);

	set_gc_sectors(c);
	c->gc_mark_valid = 1;
	c->need_gc	= 0;

	for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
			    GC_MARK_METADATA);

	/* don't reclaim buckets to which writeback keys point */
	rcu_read_lock();
	for (i = 0; i < c->devices_max_used; i++) {
		struct bcache_device *d = c->devices[i];
		struct cached_dev *dc;
		struct keybuf_key *w, *n;
		unsigned int j;

		if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
			continue;
		dc = container_of(d, struct cached_dev, disk);

		spin_lock(&dc->writeback_keys.lock);
		rbtree_postorder_for_each_entry_safe(w, n,
					&dc->writeback_keys.keys, node)
			for (j = 0; j < KEY_PTRS(&w->key); j++)
				SET_GC_MARK(PTR_BUCKET(c, &w->key, j),
					    GC_MARK_DIRTY);
		spin_unlock(&dc->writeback_keys.lock);
	}
	rcu_read_unlock();

	c->avail_nbuckets = 0;
	for_each_cache(ca, c, i) {
		uint64_t *i;

		ca->invalidate_needs_gc = 0;

		for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);

		for (i = ca->prio_buckets;
		     i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);

		for_each_bucket(b, ca) {
			c->need_gc	= max(c->need_gc, bucket_gc_gen(b));

			if (atomic_read(&b->pin))
				continue;

			BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));

			if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
				c->avail_nbuckets++;
		}
	}

	mutex_unlock(&c->bucket_lock);
}

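/*
 * One full garbage collection pass: walk the tree from the root,
 * retrying with a short sleep while the incremental walk returns
 * -EAGAIN, then finish, wake the allocators and kick off moving GC.
 */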
static void bch_btree_gc(struct cache_set *c)
{
	int ret;
	struct gc_stat stats;
	struct closure writes;
	struct btree_op op;
	uint64_t start_time = local_clock();

	trace_bcache_gc_start(c);

	memset(&stats, 0, sizeof(struct gc_stat));
	closure_init_stack(&writes);
	bch_btree_op_init(&op, SHRT_MAX);

	btree_gc_start(c);

	/* if CACHE_SET_IO_DISABLE set, gc thread should stop too */
	do {
		ret = btree_root(gc_root, c, &op, &writes, &stats);
		closure_sync(&writes);
		cond_resched();

		if (ret == -EAGAIN)
			schedule_timeout_interruptible(msecs_to_jiffies
						       (GC_SLEEP_MS));
		else if (ret)
			pr_warn("gc failed!");
	} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));

	bch_btree_gc_finish(c);
	wake_up_allocators(c);

	bch_time_stats_update(&c->btree_gc_time, start_time);

	stats.key_bytes *= sizeof(uint64_t);
	stats.data	<<= 9;
	bch_update_bucket_in_use(c, &stats);
	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));

	trace_bcache_gc_end(c);

	bch_moving_gc(c);
}

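/*
 * GC should run when a cache can't invalidate buckets without it, or
 * when enough has been written that sectors_to_gc went negative.
 */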
static bool gc_should_run(struct cache_set *c)
{
	struct cache *ca;
	unsigned int i;

	for_each_cache(ca, c, i)
		if (ca->invalidate_needs_gc)
			return true;

	if (atomic_read(&c->sectors_to_gc) < 0)
		return true;

	return false;
}

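/* GC thread: sleep until gc_should_run() (or shutdown), then run a pass */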
static int bch_gc_thread(void *arg)
{
	struct cache_set *c = arg;

	while (1) {
		wait_event_interruptible(c->gc_wait,
			   kthread_should_stop() ||
			   test_bit(CACHE_SET_IO_DISABLE, &c->flags) ||
			   gc_should_run(c));

		if (kthread_should_stop() ||
		    test_bit(CACHE_SET_IO_DISABLE, &c->flags))
			break;

		set_gc_sectors(c);
		bch_btree_gc(c);
	}

	wait_for_kthread_stop();
	return 0;
}

int bch_gc_thread_start(struct cache_set *c)
{
	c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc");
	return PTR_ERR_OR_ZERO(c->gc_thread);
}

/* Initial partial gc */

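/*
 * Mark every key in the tree once at registration so generations and GC
 * marks start out consistent. On interior nodes the next child is
 * prefetched (and counted in gc_stats.nodes, which seeds the incremental
 * GC heuristic) while the previous one is recursed into.
 */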
static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
{
	int ret = 0;
	struct bkey *k, *p = NULL;
	struct btree_iter iter;

	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
		bch_initial_mark_key(b->c, b->level, k);

	bch_initial_mark_key(b->c, b->level + 1, &b->key);

	if (b->level) {
		bch_btree_iter_init(&b->keys, &iter, NULL);

		do {
			k = bch_btree_iter_next_filter(&iter, &b->keys,
						       bch_ptr_bad);
			if (k) {
				btree_node_prefetch(b, k);
				/*
				 * initialize c->gc_stats.nodes
				 * for incremental GC
				 */
				b->c->gc_stats.nodes++;
			}

			if (p)
				ret = btree(check_recurse, p, b, op);

			p = k;
		} while (p && !ret);
	}

	return ret;
}

int bch_btree_check(struct cache_set *c)
{
	struct btree_op op;

	bch_btree_op_init(&op, SHRT_MAX);

	return btree_root(check_recurse, c, &op);
}

void bch_initial_gc_finish(struct cache_set *c)
{
	struct cache *ca;
	struct bucket *b;
	unsigned int i;

	bch_btree_gc_finish(c);

	mutex_lock(&c->bucket_lock);

	/*
	 * We need to put some unused buckets directly on the prio freelist in
	 * order to get the allocator thread started - it needs freed buckets in
	 * order to rewrite the prios and gens, and it needs to rewrite prios
	 * and gens in order to free buckets.
	 *
	 * This is only safe for buckets that have no live data in them, which
	 * there should always be some of.
	 */
	for_each_cache(ca, c, i) {
		for_each_bucket(b, ca) {
			if (fifo_full(&ca->free[RESERVE_PRIO]) &&
			    fifo_full(&ca->free[RESERVE_BTREE]))
				break;

			if (bch_can_invalidate_bucket(ca, b) &&
			    !GC_MARK(b)) {
				__bch_invalidate_one_bucket(ca, b);
				if (!fifo_push(&ca->free[RESERVE_PRIO],
				   b - ca->buckets))
					fifo_push(&ca->free[RESERVE_BTREE],
						  b - ca->buckets);
			}
		}
	}

	mutex_unlock(&c->bucket_lock);
}

/* Btree insertion */

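/*
 * Insert a single key into one node; returns true if anything was
 * actually inserted. bch_check_keys() is a debug mode consistency check.
 */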
static bool btree_insert_key(struct btree *b, struct bkey *k,
			     struct bkey *replace_key)
{
	unsigned int status;

	BUG_ON(bkey_cmp(k, &b->key) > 0);

	status = bch_btree_insert_key(&b->keys, k, replace_key);
	if (status != BTREE_INSERT_STATUS_NO_INSERT) {
		bch_check_keys(&b->keys, "%u for %s", status,
			       replace_key ? "replace" : "insert");

		trace_bcache_btree_insert_key(b, k, replace_key != NULL,
					      status);
		return true;
	} else
		return false;
}

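/*
 * Space left in the node's open bset, in u64s, less a margin for the
 * extra key an extent split can create.
 */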
static size_t insert_u64s_remaining(struct btree *b)
{
	long ret = bch_btree_keys_u64s_remaining(&b->keys);

	/*
	 * Might land in the middle of an existing extent and have to split it
	 */
	if (b->keys.ops->is_extents)
		ret -= KEY_MAX_U64S;

	return max(ret, 0L);
}

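/*
 * Insert as many keys from the list as fit in this node. An extent that
 * straddles b->key is cut in two: the front half is inserted here, the
 * back half stays on the keylist for the next node.
 */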
static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
				  struct keylist *insert_keys,
				  struct bkey *replace_key)
{
	bool ret = false;
	int oldsize = bch_count_data(&b->keys);

	while (!bch_keylist_empty(insert_keys)) {
		struct bkey *k = insert_keys->keys;

		if (bkey_u64s(k) > insert_u64s_remaining(b))
			break;

		if (bkey_cmp(k, &b->key) <= 0) {
			if (!b->level)
				bkey_put(b->c, k);

			ret |= btree_insert_key(b, k, replace_key);
			bch_keylist_pop_front(insert_keys);
		} else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
			BKEY_PADDED(key) temp;
			bkey_copy(&temp.key, insert_keys->keys);

			bch_cut_back(&b->key, &temp.key);
			bch_cut_front(&b->key, insert_keys->keys);

			ret |= btree_insert_key(b, &temp.key, replace_key);
			break;
		} else {
			break;
		}
	}

	if (!ret)
		op->insert_collision = true;

	BUG_ON(!bch_keylist_empty(insert_keys) && b->level);

	BUG_ON(bch_count_data(&b->keys) < oldsize);
	return ret;
}

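/*
 * Rewrite a node that can't fit the pending insert: either as one
 * compacted node, or, if the result would still be more than 4/5 full,
 * as two nodes (plus a new root n3 when the old root itself splits).
 * Keys for the new node(s) are then inserted into the parent and the
 * old node is freed; allocation failures unwind through the err labels.
 */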
static int btree_split(struct btree *b, struct btree_op *op,
		       struct keylist *insert_keys,
		       struct bkey *replace_key)
{
	bool split;
	struct btree *n1, *n2 = NULL, *n3 = NULL;
	uint64_t start_time = local_clock();
	struct closure cl;
	struct keylist parent_keys;

	closure_init_stack(&cl);
	bch_keylist_init(&parent_keys);

	if (btree_check_reserve(b, op)) {
		if (!b->level)
			return -EINTR;
		else
			WARN(1, "insufficient reserve for split\n");
	}

	n1 = btree_node_alloc_replacement(b, op);
	if (IS_ERR(n1))
		goto err;

	split = set_blocks(btree_bset_first(n1),
			   block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;

	if (split) {
		unsigned int keys = 0;

		trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);

		n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent);
		if (IS_ERR(n2))
			goto err_free1;

		if (!b->parent) {
			n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL);
			if (IS_ERR(n3))
				goto err_free2;
		}

		mutex_lock(&n1->write_lock);
		mutex_lock(&n2->write_lock);

		bch_btree_insert_keys(n1, op, insert_keys, replace_key);

		/*
		 * Has to be a linear search because we don't have an auxiliary
		 * search tree yet
		 */
		while (keys < (btree_bset_first(n1)->keys * 3) / 5)
			keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1),
							keys));

		bkey_copy_key(&n1->key,
			      bset_bkey_idx(btree_bset_first(n1), keys));
		keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys));

		btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys;
		btree_bset_first(n1)->keys = keys;

		memcpy(btree_bset_first(n2)->start,
		       bset_bkey_last(btree_bset_first(n1)),
		       btree_bset_first(n2)->keys * sizeof(uint64_t));

		bkey_copy_key(&n2->key, &b->key);

		bch_keylist_add(&parent_keys, &n2->key);
		bch_btree_node_write(n2, &cl);
		mutex_unlock(&n2->write_lock);
		rw_unlock(true, n2);
	} else {
		trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);

		mutex_lock(&n1->write_lock);
		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
	}

	bch_keylist_add(&parent_keys, &n1->key);
	bch_btree_node_write(n1, &cl);
	mutex_unlock(&n1->write_lock);

	if (n3) {
		/* Depth increases, make a new root */
		mutex_lock(&n3->write_lock);
		bkey_copy_key(&n3->key, &MAX_KEY);
		bch_btree_insert_keys(n3, op, &parent_keys, NULL);
		bch_btree_node_write(n3, &cl);
		mutex_unlock(&n3->write_lock);

		closure_sync(&cl);
		bch_btree_set_root(n3);
		rw_unlock(true, n3);
	} else if (!b->parent) {
		/* Root filled up but didn't need to be split */
		closure_sync(&cl);
		bch_btree_set_root(n1);
	} else {
		/* Split a non root node */
		closure_sync(&cl);
		make_btree_freeing_key(b, parent_keys.top);
		bch_keylist_push(&parent_keys);

		bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
		BUG_ON(!bch_keylist_empty(&parent_keys));
	}

	btree_node_free(b);
	rw_unlock(true, n1);

	bch_time_stats_update(&b->c->btree_split_time, start_time);

	return 0;
err_free2:
	bkey_put(b->c, &n2->key);
	btree_node_free(n2);
	rw_unlock(true, n2);
err_free1:
	bkey_put(b->c, &n1->key);
	btree_node_free(n1);
	rw_unlock(true, n1);
err:
	WARN(1, "bcache: btree split failed (level %u)", b->level);

	if (n3 == ERR_PTR(-EAGAIN) ||
	    n2 == ERR_PTR(-EAGAIN) ||
	    n1 == ERR_PTR(-EAGAIN))
		return -EAGAIN;

	return -ENOMEM;
}

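/*
 * Insert into a single node under its write lock. If the keys don't
 * fit, fall through to btree_split() - but only when we aren't in an
 * I/O submission context and op->lock was taken high enough; otherwise
 * bail with -EAGAIN/-EINTR so the caller retries with stronger locks.
 */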
static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
				 struct keylist *insert_keys,
				 atomic_t *journal_ref,
				 struct bkey *replace_key)
{
	struct closure cl;

	BUG_ON(b->level && replace_key);

	closure_init_stack(&cl);

	mutex_lock(&b->write_lock);

	if (write_block(b) != btree_bset_last(b) &&
	    b->keys.last_set_unwritten)
		bch_btree_init_next(b); /* just wrote a set */

	if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
		mutex_unlock(&b->write_lock);
		goto split;
	}

	BUG_ON(write_block(b) != btree_bset_last(b));

	if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
		if (!b->level)
			bch_btree_leaf_dirty(b, journal_ref);
		else
			bch_btree_node_write(b, &cl);
	}

	mutex_unlock(&b->write_lock);

	/* wait for btree node write if necessary, after unlock */
	closure_sync(&cl);

	return 0;
split:
	if (current->bio_list) {
		op->lock = b->c->root->level + 1;
		return -EAGAIN;
	} else if (op->lock <= b->c->root->level) {
		op->lock = b->c->root->level + 1;
		return -EINTR;
	} else {
		/* Invalidated all iterators */
		int ret = btree_split(b, op, insert_keys, replace_key);

		if (bch_keylist_empty(insert_keys))
			return 0;
		else if (!ret)
			return -EINTR;
		return ret;
	}
}

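/*
 * Insert a zero-size "check key" with a fresh random pointer (device
 * PTR_CHECK_DEV); the cache miss path apparently uses this so that a
 * later replace fails if something else got inserted in the meantime.
 * May upgrade the node's read lock to a write lock (op->lock == -1).
 */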
int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
			       struct bkey *check_key)
{
	int ret = -EINTR;
	uint64_t btree_ptr = b->key.ptr[0];
	unsigned long seq = b->seq;
	struct keylist insert;
	bool upgrade = op->lock == -1;

	bch_keylist_init(&insert);

	if (upgrade) {
		rw_unlock(false, b);
		rw_lock(true, b, b->level);

		if (b->key.ptr[0] != btree_ptr ||
		    b->seq != seq + 1) {
			op->lock = b->level;
			goto out;
		}
	}

	SET_KEY_PTRS(check_key, 1);
	get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));

	SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);

	bch_keylist_add(&insert, check_key);

	ret = bch_btree_insert_node(b, op, &insert, NULL, NULL);

	BUG_ON(!ret && !bch_keylist_empty(&insert));
out:
	if (upgrade)
		downgrade_write(&b->lock);
	return ret;
}

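/*
 * bch_btree_insert() drives insertion through the leaf-node map below:
 * btree_insert_fn() is applied to each leaf the keylist spans until the
 * list drains or an error occurs.
 */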
struct btree_insert_op {
	struct btree_op	op;
	struct keylist	*keys;
	atomic_t	*journal_ref;
	struct bkey	*replace_key;
};

static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
{
	struct btree_insert_op *op = container_of(b_op,
					struct btree_insert_op, op);

	int ret = bch_btree_insert_node(b, &op->op, op->keys,
					op->journal_ref, op->replace_key);
	if (ret && !bch_keylist_empty(op->keys))
		return ret;
	else
		return MAP_DONE;
}

int bch_btree_insert(struct cache_set *c, struct keylist *keys,
		     atomic_t *journal_ref, struct bkey *replace_key)
{
	struct btree_insert_op op;
	int ret = 0;

	BUG_ON(current->bio_list);
	BUG_ON(bch_keylist_empty(keys));

	bch_btree_op_init(&op.op, 0);
	op.keys		= keys;
	op.journal_ref	= journal_ref;
	op.replace_key	= replace_key;

	while (!ret && !bch_keylist_empty(keys)) {
		op.op.lock = 0;
		ret = bch_btree_map_leaf_nodes(&op.op, c,
					       &START_KEY(keys->keys),
					       btree_insert_fn);
	}

	if (ret) {
		struct bkey *k;

		pr_err("error %i", ret);

		while ((k = bch_keylist_pop(keys)))
			bkey_put(c, k);
	} else if (op.op.insert_collision)
		ret = -ESRCH;

	return ret;
}

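/*
 * Point the cache set at a new root node and journal the change;
 * waits for the journal write before returning.
 */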
void bch_btree_set_root(struct btree *b)
{
	unsigned int i;
	struct closure cl;

	closure_init_stack(&cl);

	trace_bcache_btree_set_root(b);

	BUG_ON(!b->written);

	for (i = 0; i < KEY_PTRS(&b->key); i++)
		BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);

	mutex_lock(&b->c->bucket_lock);
	list_del_init(&b->list);
	mutex_unlock(&b->c->bucket_lock);

	b->c->root = b;

	bch_journal_meta(b->c, &cl);
	closure_sync(&cl);
}

/* Map across nodes or keys */

static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
				       struct bkey *from,
				       btree_map_nodes_fn *fn, int flags)
{
	int ret = MAP_CONTINUE;

	if (b->level) {
		struct bkey *k;
		struct btree_iter iter;

		bch_btree_iter_init(&b->keys, &iter, from);

		while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
						       bch_ptr_bad))) {
			ret = btree(map_nodes_recurse, k, b,
				    op, from, fn, flags);
			from = NULL;

			if (ret != MAP_CONTINUE)
				return ret;
		}
	}

	if (!b->level || flags == MAP_ALL_NODES)
		ret = fn(op, b);

	return ret;
}

int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
			  struct bkey *from, btree_map_nodes_fn *fn, int flags)
{
	return btree_root(map_nodes_recurse, c, op, from, fn, flags);
}

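/*
 * Like the node map above but the callback sees individual keys; with
 * MAP_END_KEY the callback is additionally invoked on a zero-size key
 * at the end of each leaf.
 */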
static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
				      struct bkey *from, btree_map_keys_fn *fn,
				      int flags)
{
	int ret = MAP_CONTINUE;
	struct bkey *k;
	struct btree_iter iter;

	bch_btree_iter_init(&b->keys, &iter, from);

	while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
		ret = !b->level
			? fn(op, b, k)
			: btree(map_keys_recurse, k, b, op, from, fn, flags);
		from = NULL;

		if (ret != MAP_CONTINUE)
			return ret;
	}

	if (!b->level && (flags & MAP_END_KEY))
		ret = fn(op, b, &KEY(KEY_INODE(&b->key),
				     KEY_OFFSET(&b->key), 0));

	return ret;
}

int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
		       struct bkey *from, btree_map_keys_fn *fn, int flags)
{
	return btree_root(map_keys_recurse, c, op, from, fn, flags);
}

/* Keybuf code */

static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
{
	/* Overlapping keys compare equal */
	if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
		return -1;
	if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
		return 1;
	return 0;
}

static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
					    struct keybuf_key *r)
{
	return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
}

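/*
 * State for refilling a keybuf from the btree: keys below *end that
 * match pred() are copied into the buffer's rbtree until its freelist
 * runs out; nr_found is only used for tracing.
 */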
struct refill {
	struct btree_op	op;
	unsigned int	nr_found;
	struct keybuf	*buf;
	struct bkey	*end;
	keybuf_pred_fn	*pred;
};

static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
			    struct bkey *k)
{
	struct refill *refill = container_of(op, struct refill, op);
	struct keybuf *buf = refill->buf;
	int ret = MAP_CONTINUE;

	if (bkey_cmp(k, refill->end) >= 0) {
		ret = MAP_DONE;
		goto out;
	}

	if (!KEY_SIZE(k)) /* end key */
		goto out;

	if (refill->pred(buf, k)) {
		struct keybuf_key *w;

		spin_lock(&buf->lock);

		w = array_alloc(&buf->freelist);
		if (!w) {
			spin_unlock(&buf->lock);
			return MAP_DONE;
		}

		w->private = NULL;
		bkey_copy(&w->key, k);

		if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
			array_free(&buf->freelist, w);
		else
			refill->nr_found++;

		if (array_freelist_empty(&buf->freelist))
			ret = MAP_DONE;

		spin_unlock(&buf->lock);
	}
out:
	buf->last_scanned = *k;
	return ret;
}

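/*
 * Scan forward from buf->last_scanned collecting matching keys until
 * *end or until the buffer fills, then recompute the [start, end)
 * range the buffer currently covers.
 */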
void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
		       struct bkey *end, keybuf_pred_fn *pred)
{
	struct bkey start = buf->last_scanned;
	struct refill refill;

	cond_resched();

	bch_btree_op_init(&refill.op, -1);
	refill.nr_found	= 0;
	refill.buf	= buf;
	refill.end	= end;
	refill.pred	= pred;

	bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
			   refill_keybuf_fn, MAP_END_KEY);

	trace_bcache_keyscan(refill.nr_found,
			     KEY_INODE(&start), KEY_OFFSET(&start),
			     KEY_INODE(&buf->last_scanned),
			     KEY_OFFSET(&buf->last_scanned));

	spin_lock(&buf->lock);

	if (!RB_EMPTY_ROOT(&buf->keys)) {
		struct keybuf_key *w;

		w = RB_FIRST(&buf->keys, struct keybuf_key, node);
		buf->start	= START_KEY(&w->key);

		w = RB_LAST(&buf->keys, struct keybuf_key, node);
		buf->end	= w->key;
	} else {
		buf->start	= MAX_KEY;
		buf->end	= MAX_KEY;
	}

	spin_unlock(&buf->lock);
}

static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
	rb_erase(&w->node, &buf->keys);
	array_free(&buf->freelist, w);
}

void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
	spin_lock(&buf->lock);
	__bch_keybuf_del(buf, w);
	spin_unlock(&buf->lock);
}

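/*
 * Returns true if any key overlapping [start, end) is currently in use
 * (w->private set by a consumer); unused overlapping keys are dropped
 * from the buffer.
 */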
bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
				  struct bkey *end)
{
	bool ret = false;
	struct keybuf_key *p, *w, s;

	s.key = *start;

	if (bkey_cmp(end, &buf->start) <= 0 ||
	    bkey_cmp(start, &buf->end) >= 0)
		return false;

	spin_lock(&buf->lock);
	w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);

	while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
		p = w;
		w = RB_NEXT(w, node);

		if (p->private)
			ret = true;
		else
			__bch_keybuf_del(buf, p);
	}

	spin_unlock(&buf->lock);
	return ret;
}

struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
{
	struct keybuf_key *w;

	spin_lock(&buf->lock);

	w = RB_FIRST(&buf->keys, struct keybuf_key, node);

	while (w && w->private)
		w = RB_NEXT(w, node);

	if (w)
		w->private = ERR_PTR(-EINTR);

	spin_unlock(&buf->lock);
	return w;
}

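/*
 * Like bch_keybuf_next(), but refills the buffer from the btree when
 * it runs dry, until the scan has moved past *end.
 */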
struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
					  struct keybuf *buf,
					  struct bkey *end,
					  keybuf_pred_fn *pred)
{
	struct keybuf_key *ret;

	while (1) {
		ret = bch_keybuf_next(buf);
		if (ret)
			break;

		if (bkey_cmp(&buf->last_scanned, end) >= 0) {
			pr_debug("scan finished");
			break;
		}

		bch_refill_keybuf(c, buf, end, pred);
	}

	return ret;
}

void bch_keybuf_init(struct keybuf *buf)
{
	buf->last_scanned	= MAX_KEY;
	buf->keys		= RB_ROOT;

	spin_lock_init(&buf->lock);
	array_allocator_init(&buf->freelist);
}