super.c 58.4 KB
Newer Older
C
Coly Li 已提交
1
// SPDX-License-Identifier: GPL-2.0
K
Kent Overstreet 已提交
2 3 4 5 6 7 8 9 10 11 12
/*
 * bcache setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
13
#include "extents.h"
K
Kent Overstreet 已提交
14
#include "request.h"
15
#include "writeback.h"
K
Kent Overstreet 已提交
16

K
Kent Overstreet 已提交
17
#include <linux/blkdev.h>
K
Kent Overstreet 已提交
18 19 20
#include <linux/buffer_head.h>
#include <linux/debugfs.h>
#include <linux/genhd.h>
21
#include <linux/idr.h>
22
#include <linux/kthread.h>
K
Kent Overstreet 已提交
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
#include <linux/module.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/sysfs.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");

static const char bcache_magic[] = {
	0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
	0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
};

static const char invalid_uuid[] = {
	0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
	0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
};

static struct kobject *bcache_kobj;
struct mutex bch_register_lock;
LIST_HEAD(bch_cache_sets);
static LIST_HEAD(uncached_devices);

46
static int bcache_major;
47
static DEFINE_IDA(bcache_device_idx);
K
Kent Overstreet 已提交
48 49 50 51
static wait_queue_head_t unregister_wait;
struct workqueue_struct *bcache_wq;

#define BTREE_MAX_PAGES		(256 * 1024 / PAGE_SIZE)
52 53 54 55
/* limitation of partitions number on single bcache device */
#define BCACHE_MINORS		128
/* limitation of bcache devices number on single system */
#define BCACHE_DEVICE_IDX_MAX	((1U << MINORBITS)/BCACHE_MINORS)
K
Kent Overstreet 已提交
56 57 58 59 60 61 62 63 64

/* Superblock */

static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
			      struct page **res)
{
	const char *err;
	struct cache_sb *s;
	struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
65
	unsigned int i;
K
Kent Overstreet 已提交
66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107

	if (!bh)
		return "IO error";

	s = (struct cache_sb *) bh->b_data;

	sb->offset		= le64_to_cpu(s->offset);
	sb->version		= le64_to_cpu(s->version);

	memcpy(sb->magic,	s->magic, 16);
	memcpy(sb->uuid,	s->uuid, 16);
	memcpy(sb->set_uuid,	s->set_uuid, 16);
	memcpy(sb->label,	s->label, SB_LABEL_SIZE);

	sb->flags		= le64_to_cpu(s->flags);
	sb->seq			= le64_to_cpu(s->seq);
	sb->last_mount		= le32_to_cpu(s->last_mount);
	sb->first_bucket	= le16_to_cpu(s->first_bucket);
	sb->keys		= le16_to_cpu(s->keys);

	for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
		sb->d[i] = le64_to_cpu(s->d[i]);

	pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
		 sb->version, sb->flags, sb->seq, sb->keys);

	err = "Not a bcache superblock";
	if (sb->offset != SB_SECTOR)
		goto err;

	if (memcmp(sb->magic, bcache_magic, 16))
		goto err;

	err = "Too many journal buckets";
	if (sb->keys > SB_JOURNAL_BUCKETS)
		goto err;

	err = "Bad checksum";
	if (s->csum != csum_set(s))
		goto err;

	err = "Bad UUID";
108
	if (bch_is_zero(sb->uuid, 16))
K
Kent Overstreet 已提交
109 110
		goto err;

111 112 113 114 115 116
	sb->block_size	= le16_to_cpu(s->block_size);

	err = "Superblock block size smaller than device block size";
	if (sb->block_size << 9 < bdev_logical_block_size(bdev))
		goto err;

117 118 119 120 121 122 123 124 125 126
	switch (sb->version) {
	case BCACHE_SB_VERSION_BDEV:
		sb->data_offset	= BDEV_DATA_START_DEFAULT;
		break;
	case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
		sb->data_offset	= le64_to_cpu(s->data_offset);

		err = "Bad data offset";
		if (sb->data_offset < BDEV_DATA_START_DEFAULT)
			goto err;
K
Kent Overstreet 已提交
127

128 129 130 131 132
		break;
	case BCACHE_SB_VERSION_CDEV:
	case BCACHE_SB_VERSION_CDEV_WITH_UUID:
		sb->nbuckets	= le64_to_cpu(s->nbuckets);
		sb->bucket_size	= le16_to_cpu(s->bucket_size);
K
Kent Overstreet 已提交
133

134 135
		sb->nr_in_set	= le16_to_cpu(s->nr_in_set);
		sb->nr_this_dev	= le16_to_cpu(s->nr_this_dev);
K
Kent Overstreet 已提交
136

137 138 139
		err = "Too many buckets";
		if (sb->nbuckets > LONG_MAX)
			goto err;
K
Kent Overstreet 已提交
140

141 142 143
		err = "Not enough buckets";
		if (sb->nbuckets < 1 << 7)
			goto err;
K
Kent Overstreet 已提交
144

145 146 147 148 149 150
		err = "Bad block/bucket size";
		if (!is_power_of_2(sb->block_size) ||
		    sb->block_size > PAGE_SECTORS ||
		    !is_power_of_2(sb->bucket_size) ||
		    sb->bucket_size < PAGE_SECTORS)
			goto err;
K
Kent Overstreet 已提交
151

152
		err = "Invalid superblock: device too small";
153 154
		if (get_capacity(bdev->bd_disk) <
		    sb->bucket_size * sb->nbuckets)
155
			goto err;
K
Kent Overstreet 已提交
156

157 158 159
		err = "Bad UUID";
		if (bch_is_zero(sb->set_uuid, 16))
			goto err;
K
Kent Overstreet 已提交
160

161 162 163 164
		err = "Bad cache device number in set";
		if (!sb->nr_in_set ||
		    sb->nr_in_set <= sb->nr_this_dev ||
		    sb->nr_in_set > MAX_CACHES_PER_SET)
K
Kent Overstreet 已提交
165 166
			goto err;

167 168 169 170
		err = "Journal buckets not sequential";
		for (i = 0; i < sb->keys; i++)
			if (sb->d[i] != sb->first_bucket + i)
				goto err;
K
Kent Overstreet 已提交
171

172 173 174 175 176 177 178 179 180 181 182
		err = "Too many journal buckets";
		if (sb->first_bucket + sb->keys > sb->nbuckets)
			goto err;

		err = "Invalid superblock: first bucket comes before end of super";
		if (sb->first_bucket * sb->bucket_size < 16)
			goto err;

		break;
	default:
		err = "Unsupported superblock version";
K
Kent Overstreet 已提交
183
		goto err;
184 185
	}

186
	sb->last_mount = (u32)ktime_get_real_seconds();
K
Kent Overstreet 已提交
187 188 189 190 191 192 193 194 195
	err = NULL;

	get_page(bh->b_page);
	*res = bh->b_page;
err:
	put_bh(bh);
	return err;
}

196
static void write_bdev_super_endio(struct bio *bio)
K
Kent Overstreet 已提交
197 198 199 200
{
	struct cached_dev *dc = bio->bi_private;
	/* XXX: error checking */

201
	closure_put(&dc->sb_write);
K
Kent Overstreet 已提交
202 203 204 205
}

static void __write_super(struct cache_sb *sb, struct bio *bio)
{
206
	struct cache_sb *out = page_address(bio_first_page_all(bio));
207
	unsigned int i;
K
Kent Overstreet 已提交
208

209 210
	bio->bi_iter.bi_sector	= SB_SECTOR;
	bio->bi_iter.bi_size	= SB_SIZE;
M
Mike Christie 已提交
211
	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
212
	bch_bio_map(bio, NULL);
K
Kent Overstreet 已提交
213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235

	out->offset		= cpu_to_le64(sb->offset);
	out->version		= cpu_to_le64(sb->version);

	memcpy(out->uuid,	sb->uuid, 16);
	memcpy(out->set_uuid,	sb->set_uuid, 16);
	memcpy(out->label,	sb->label, SB_LABEL_SIZE);

	out->flags		= cpu_to_le64(sb->flags);
	out->seq		= cpu_to_le64(sb->seq);

	out->last_mount		= cpu_to_le32(sb->last_mount);
	out->first_bucket	= cpu_to_le16(sb->first_bucket);
	out->keys		= cpu_to_le16(sb->keys);

	for (i = 0; i < sb->keys; i++)
		out->d[i] = cpu_to_le64(sb->d[i]);

	out->csum = csum_set(out);

	pr_debug("ver %llu, flags %llu, seq %llu",
		 sb->version, sb->flags, sb->seq);

236
	submit_bio(bio);
K
Kent Overstreet 已提交
237 238
}

239 240 241 242 243 244 245
static void bch_write_bdev_super_unlock(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);

	up(&dc->sb_write_mutex);
}

K
Kent Overstreet 已提交
246 247
void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
{
248
	struct closure *cl = &dc->sb_write;
K
Kent Overstreet 已提交
249 250
	struct bio *bio = &dc->sb_bio;

251 252
	down(&dc->sb_write_mutex);
	closure_init(cl, parent);
K
Kent Overstreet 已提交
253 254

	bio_reset(bio);
255
	bio_set_dev(bio, dc->bdev);
K
Kent Overstreet 已提交
256 257 258 259
	bio->bi_end_io	= write_bdev_super_endio;
	bio->bi_private = dc;

	closure_get(cl);
260
	/* I/O request sent to backing device */
K
Kent Overstreet 已提交
261 262
	__write_super(&dc->sb, bio);

263
	closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
K
Kent Overstreet 已提交
264 265
}

266
static void write_super_endio(struct bio *bio)
K
Kent Overstreet 已提交
267 268 269
{
	struct cache *ca = bio->bi_private;

270 271 272
	/* is_read = 0 */
	bch_count_io_errors(ca, bio->bi_status, 0,
			    "writing superblock");
273 274 275 276 277 278 279 280
	closure_put(&ca->set->sb_write);
}

static void bcache_write_super_unlock(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, sb_write);

	up(&c->sb_write_mutex);
K
Kent Overstreet 已提交
281 282 283 284
}

void bcache_write_super(struct cache_set *c)
{
285
	struct closure *cl = &c->sb_write;
K
Kent Overstreet 已提交
286
	struct cache *ca;
287
	unsigned int i;
K
Kent Overstreet 已提交
288

289 290
	down(&c->sb_write_mutex);
	closure_init(cl, &c->cl);
K
Kent Overstreet 已提交
291 292 293 294 295 296

	c->sb.seq++;

	for_each_cache(ca, c, i) {
		struct bio *bio = &ca->sb_bio;

297
		ca->sb.version		= BCACHE_SB_VERSION_CDEV_WITH_UUID;
K
Kent Overstreet 已提交
298 299 300 301 302 303
		ca->sb.seq		= c->sb.seq;
		ca->sb.last_mount	= c->sb.last_mount;

		SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));

		bio_reset(bio);
304
		bio_set_dev(bio, ca->bdev);
K
Kent Overstreet 已提交
305 306 307 308 309 310 311
		bio->bi_end_io	= write_super_endio;
		bio->bi_private = ca;

		closure_get(cl);
		__write_super(&ca->sb, bio);
	}

312
	closure_return_with_destructor(cl, bcache_write_super_unlock);
K
Kent Overstreet 已提交
313 314 315 316
}

/* UUID io */

317
static void uuid_endio(struct bio *bio)
K
Kent Overstreet 已提交
318 319
{
	struct closure *cl = bio->bi_private;
320
	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
K
Kent Overstreet 已提交
321

322
	cache_set_err_on(bio->bi_status, c, "accessing uuids");
K
Kent Overstreet 已提交
323 324 325 326
	bch_bbio_free(bio, c);
	closure_put(cl);
}

327 328 329 330 331 332 333
static void uuid_io_unlock(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);

	up(&c->uuid_write_mutex);
}

M
Mike Christie 已提交
334
static void uuid_io(struct cache_set *c, int op, unsigned long op_flags,
K
Kent Overstreet 已提交
335 336
		    struct bkey *k, struct closure *parent)
{
337
	struct closure *cl = &c->uuid_write;
K
Kent Overstreet 已提交
338
	struct uuid_entry *u;
339
	unsigned int i;
340
	char buf[80];
K
Kent Overstreet 已提交
341 342

	BUG_ON(!parent);
343 344
	down(&c->uuid_write_mutex);
	closure_init(cl, parent);
K
Kent Overstreet 已提交
345 346 347 348

	for (i = 0; i < KEY_PTRS(k); i++) {
		struct bio *bio = bch_bbio_alloc(c);

J
Jens Axboe 已提交
349
		bio->bi_opf = REQ_SYNC | REQ_META | op_flags;
350
		bio->bi_iter.bi_size = KEY_SIZE(k) << 9;
K
Kent Overstreet 已提交
351 352 353

		bio->bi_end_io	= uuid_endio;
		bio->bi_private = cl;
M
Mike Christie 已提交
354
		bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
355
		bch_bio_map(bio, c->uuids);
K
Kent Overstreet 已提交
356 357 358

		bch_submit_bbio(bio, c, k, i);

M
Mike Christie 已提交
359
		if (op != REQ_OP_WRITE)
K
Kent Overstreet 已提交
360 361 362
			break;
	}

363
	bch_extent_to_text(buf, sizeof(buf), k);
M
Mike Christie 已提交
364
	pr_debug("%s UUIDs at %s", op == REQ_OP_WRITE ? "wrote" : "read", buf);
K
Kent Overstreet 已提交
365 366

	for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
367
		if (!bch_is_zero(u->uuid, 16))
K
Kent Overstreet 已提交
368 369 370 371
			pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
				 u - c->uuids, u->uuid, u->label,
				 u->first_reg, u->last_reg, u->invalidated);

372
	closure_return_with_destructor(cl, uuid_io_unlock);
K
Kent Overstreet 已提交
373 374 375 376 377 378
}

static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
{
	struct bkey *k = &j->uuid_bucket;

379
	if (__bch_btree_ptr_invalid(c, k))
K
Kent Overstreet 已提交
380 381 382
		return "bad uuid pointer";

	bkey_copy(&c->uuid_bucket, k);
383
	uuid_io(c, REQ_OP_READ, 0, k, cl);
K
Kent Overstreet 已提交
384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420

	if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
		struct uuid_entry_v0	*u0 = (void *) c->uuids;
		struct uuid_entry	*u1 = (void *) c->uuids;
		int i;

		closure_sync(cl);

		/*
		 * Since the new uuid entry is bigger than the old, we have to
		 * convert starting at the highest memory address and work down
		 * in order to do it in place
		 */

		for (i = c->nr_uuids - 1;
		     i >= 0;
		     --i) {
			memcpy(u1[i].uuid,	u0[i].uuid, 16);
			memcpy(u1[i].label,	u0[i].label, 32);

			u1[i].first_reg		= u0[i].first_reg;
			u1[i].last_reg		= u0[i].last_reg;
			u1[i].invalidated	= u0[i].invalidated;

			u1[i].flags	= 0;
			u1[i].sectors	= 0;
		}
	}

	return NULL;
}

static int __uuid_write(struct cache_set *c)
{
	BKEY_PADDED(key) k;
	struct closure cl;

421
	closure_init_stack(&cl);
K
Kent Overstreet 已提交
422 423
	lockdep_assert_held(&bch_register_lock);

424
	if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
K
Kent Overstreet 已提交
425 426 427
		return 1;

	SET_KEY_SIZE(&k.key, c->sb.bucket_size);
M
Mike Christie 已提交
428
	uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
K
Kent Overstreet 已提交
429 430 431
	closure_sync(&cl);

	bkey_copy(&c->uuid_bucket, &k.key);
432
	bkey_put(c, &k.key);
K
Kent Overstreet 已提交
433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460
	return 0;
}

int bch_uuid_write(struct cache_set *c)
{
	int ret = __uuid_write(c);

	if (!ret)
		bch_journal_meta(c, NULL);

	return ret;
}

static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
{
	struct uuid_entry *u;

	for (u = c->uuids;
	     u < c->uuids + c->nr_uuids; u++)
		if (!memcmp(u->uuid, uuid, 16))
			return u;

	return NULL;
}

static struct uuid_entry *uuid_find_empty(struct cache_set *c)
{
	static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
461

K
Kent Overstreet 已提交
462 463 464 465 466 467 468
	return uuid_find(c, zero_uuid);
}

/*
 * Bucket priorities/gens:
 *
 * For each bucket, we store on disk its
C
Coly Li 已提交
469 470
 *   8 bit gen
 *  16 bit priority
K
Kent Overstreet 已提交
471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491
 *
 * See alloc.c for an explanation of the gen. The priority is used to implement
 * lru (and in the future other) cache replacement policies; for most purposes
 * it's just an opaque integer.
 *
 * The gens and the priorities don't have a whole lot to do with each other, and
 * it's actually the gens that must be written out at specific times - it's no
 * big deal if the priorities don't get written, if we lose them we just reuse
 * buckets in suboptimal order.
 *
 * On disk they're stored in a packed array, and in as many buckets are required
 * to fit them all. The buckets we use to store them form a list; the journal
 * header points to the first bucket, the first bucket points to the second
 * bucket, et cetera.
 *
 * This code is used by the allocation code; periodically (whenever it runs out
 * of buckets to allocate from) the allocation code will invalidate some
 * buckets, but it can't use those buckets until their new gens are safely on
 * disk.
 */

492
static void prio_endio(struct bio *bio)
K
Kent Overstreet 已提交
493 494 495
{
	struct cache *ca = bio->bi_private;

496
	cache_set_err_on(bio->bi_status, ca->set, "accessing priorities");
K
Kent Overstreet 已提交
497 498 499 500
	bch_bbio_free(bio, ca->set);
	closure_put(&ca->prio);
}

M
Mike Christie 已提交
501 502
static void prio_io(struct cache *ca, uint64_t bucket, int op,
		    unsigned long op_flags)
K
Kent Overstreet 已提交
503 504 505 506 507 508
{
	struct closure *cl = &ca->prio;
	struct bio *bio = bch_bbio_alloc(ca->set);

	closure_init_stack(cl);

509
	bio->bi_iter.bi_sector	= bucket * ca->sb.bucket_size;
510
	bio_set_dev(bio, ca->bdev);
511
	bio->bi_iter.bi_size	= bucket_bytes(ca);
K
Kent Overstreet 已提交
512 513 514

	bio->bi_end_io	= prio_endio;
	bio->bi_private = ca;
M
Mike Christie 已提交
515
	bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
516
	bch_bio_map(bio, ca->disk_buckets);
K
Kent Overstreet 已提交
517

518
	closure_bio_submit(ca->set, bio, &ca->prio);
K
Kent Overstreet 已提交
519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536
	closure_sync(cl);
}

void bch_prio_write(struct cache *ca)
{
	int i;
	struct bucket *b;
	struct closure cl;

	closure_init_stack(&cl);

	lockdep_assert_held(&ca->set->bucket_lock);

	ca->disk_buckets->seq++;

	atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
			&ca->meta_sectors_written);

537 538
	//pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
	//	 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
K
Kent Overstreet 已提交
539 540 541 542

	for (i = prio_buckets(ca) - 1; i >= 0; --i) {
		long bucket;
		struct prio_set *p = ca->disk_buckets;
K
Kent Overstreet 已提交
543 544
		struct bucket_disk *d = p->data;
		struct bucket_disk *end = d + prios_per_bucket(ca);
K
Kent Overstreet 已提交
545 546 547 548 549 550 551 552 553

		for (b = ca->buckets + i * prios_per_bucket(ca);
		     b < ca->buckets + ca->sb.nbuckets && d < end;
		     b++, d++) {
			d->prio = cpu_to_le16(b->prio);
			d->gen = b->gen;
		}

		p->next_bucket	= ca->prio_buckets[i + 1];
554
		p->magic	= pset_magic(&ca->sb);
555
		p->csum		= bch_crc64(&p->magic, bucket_bytes(ca) - 8);
K
Kent Overstreet 已提交
556

557
		bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
K
Kent Overstreet 已提交
558 559 560
		BUG_ON(bucket == -1);

		mutex_unlock(&ca->set->bucket_lock);
M
Mike Christie 已提交
561
		prio_io(ca, bucket, REQ_OP_WRITE, 0);
K
Kent Overstreet 已提交
562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578
		mutex_lock(&ca->set->bucket_lock);

		ca->prio_buckets[i] = bucket;
		atomic_dec_bug(&ca->buckets[bucket].pin);
	}

	mutex_unlock(&ca->set->bucket_lock);

	bch_journal_meta(ca->set, &cl);
	closure_sync(&cl);

	mutex_lock(&ca->set->bucket_lock);

	/*
	 * Don't want the old priorities to get garbage collected until after we
	 * finish writing the new ones, and they're journalled
	 */
K
Kent Overstreet 已提交
579 580 581 582 583
	for (i = 0; i < prio_buckets(ca); i++) {
		if (ca->prio_last_buckets[i])
			__bch_bucket_free(ca,
				&ca->buckets[ca->prio_last_buckets[i]]);

K
Kent Overstreet 已提交
584
		ca->prio_last_buckets[i] = ca->prio_buckets[i];
K
Kent Overstreet 已提交
585
	}
K
Kent Overstreet 已提交
586 587 588 589 590 591 592
}

static void prio_read(struct cache *ca, uint64_t bucket)
{
	struct prio_set *p = ca->disk_buckets;
	struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
	struct bucket *b;
593
	unsigned int bucket_nr = 0;
K
Kent Overstreet 已提交
594 595 596 597 598 599 600 601 602

	for (b = ca->buckets;
	     b < ca->buckets + ca->sb.nbuckets;
	     b++, d++) {
		if (d == end) {
			ca->prio_buckets[bucket_nr] = bucket;
			ca->prio_last_buckets[bucket_nr] = bucket;
			bucket_nr++;

603
			prio_io(ca, bucket, REQ_OP_READ, 0);
K
Kent Overstreet 已提交
604

605 606
			if (p->csum !=
			    bch_crc64(&p->magic, bucket_bytes(ca) - 8))
K
Kent Overstreet 已提交
607 608
				pr_warn("bad csum reading priorities");

609
			if (p->magic != pset_magic(&ca->sb))
K
Kent Overstreet 已提交
610 611 612 613 614 615 616
				pr_warn("bad magic reading priorities");

			bucket = p->next_bucket;
			d = p->data;
		}

		b->prio = le16_to_cpu(d->prio);
K
Kent Overstreet 已提交
617
		b->gen = b->last_gc = d->gen;
K
Kent Overstreet 已提交
618 619 620 621 622 623 624 625
	}
}

/* Bcache device */

static int open_dev(struct block_device *b, fmode_t mode)
{
	struct bcache_device *d = b->bd_disk->private_data;
626

627
	if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
K
Kent Overstreet 已提交
628 629 630 631 632 633
		return -ENXIO;

	closure_get(&d->cl);
	return 0;
}

634
static void release_dev(struct gendisk *b, fmode_t mode)
K
Kent Overstreet 已提交
635 636
{
	struct bcache_device *d = b->private_data;
637

K
Kent Overstreet 已提交
638 639 640 641 642 643 644
	closure_put(&d->cl);
}

static int ioctl_dev(struct block_device *b, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
	struct bcache_device *d = b->bd_disk->private_data;
645 646 647 648 649
	struct cached_dev *dc = container_of(d, struct cached_dev, disk);

	if (dc->io_disable)
		return -EIO;

K
Kent Overstreet 已提交
650 651 652 653 654 655 656 657 658 659 660 661
	return d->ioctl(d, mode, cmd, arg);
}

static const struct block_device_operations bcache_ops = {
	.open		= open_dev,
	.release	= release_dev,
	.ioctl		= ioctl_dev,
	.owner		= THIS_MODULE,
};

void bcache_device_stop(struct bcache_device *d)
{
662
	if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
K
Kent Overstreet 已提交
663 664 665
		closure_queue(&d->cl);
}

666 667
static void bcache_device_unlink(struct bcache_device *d)
{
668
	lockdep_assert_held(&bch_register_lock);
669

670
	if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
671
		unsigned int i;
672
		struct cache *ca;
673

674 675 676 677 678 679
		sysfs_remove_link(&d->c->kobj, d->name);
		sysfs_remove_link(&d->kobj, "cache");

		for_each_cache(ca, d->c, i)
			bd_unlink_disk_holder(ca->bdev, d->disk);
	}
680 681 682 683 684
}

static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
			       const char *name)
{
685
	unsigned int i;
686 687 688 689 690 691 692 693 694 695 696
	struct cache *ca;

	for_each_cache(ca, d->c, i)
		bd_link_disk_holder(ca->bdev, d->disk);

	snprintf(d->name, BCACHEDEVNAME_SIZE,
		 "%s%u", name, d->id);

	WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
	     sysfs_create_link(&c->kobj, &d->kobj, d->name),
	     "Couldn't create device <-> cache set symlinks");
697 698

	clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
699 700
}

K
Kent Overstreet 已提交
701 702 703 704
static void bcache_device_detach(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

705 706
	atomic_dec(&d->c->attached_dev_nr);

707
	if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
K
Kent Overstreet 已提交
708 709 710 711
		struct uuid_entry *u = d->c->uuids + d->id;

		SET_UUID_FLASH_ONLY(u, 0);
		memcpy(u->uuid, invalid_uuid, 16);
712
		u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
K
Kent Overstreet 已提交
713 714 715
		bch_uuid_write(d->c);
	}

716
	bcache_device_unlink(d);
717

K
Kent Overstreet 已提交
718 719 720 721 722 723
	d->c->devices[d->id] = NULL;
	closure_put(&d->c->caching);
	d->c = NULL;
}

static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
724
				 unsigned int id)
K
Kent Overstreet 已提交
725 726 727 728 729
{
	d->id = id;
	d->c = c;
	c->devices[id] = d;

730 731 732
	if (id >= c->devices_max_used)
		c->devices_max_used = id + 1;

K
Kent Overstreet 已提交
733 734 735
	closure_get(&c->caching);
}

736 737 738 739 740 741 742 743 744 745
static inline int first_minor_to_idx(int first_minor)
{
	return (first_minor/BCACHE_MINORS);
}

static inline int idx_to_first_minor(int idx)
{
	return (idx * BCACHE_MINORS);
}

K
Kent Overstreet 已提交
746 747 748 749 750 751 752 753
static void bcache_device_free(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	pr_info("%s stopped", d->disk->disk_name);

	if (d->c)
		bcache_device_detach(d);
754
	if (d->disk && d->disk->flags & GENHD_FL_UP)
K
Kent Overstreet 已提交
755 756 757
		del_gendisk(d->disk);
	if (d->disk && d->disk->queue)
		blk_cleanup_queue(d->disk->queue);
758
	if (d->disk) {
759 760
		ida_simple_remove(&bcache_device_idx,
				  first_minor_to_idx(d->disk->first_minor));
K
Kent Overstreet 已提交
761
		put_disk(d->disk);
762
	}
K
Kent Overstreet 已提交
763

764
	bioset_exit(&d->bio_split);
765 766
	kvfree(d->full_dirty_stripes);
	kvfree(d->stripe_sectors_dirty);
K
Kent Overstreet 已提交
767 768 769 770

	closure_debug_destroy(&d->cl);
}

771
static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
772
			      sector_t sectors)
K
Kent Overstreet 已提交
773 774
{
	struct request_queue *q;
775 776
	const size_t max_stripes = min_t(size_t, INT_MAX,
					 SIZE_MAX / sizeof(atomic_t));
777
	size_t n;
778
	int idx;
779

780 781
	if (!d->stripe_size)
		d->stripe_size = 1 << 31;
782

783
	d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
784

785
	if (!d->nr_stripes || d->nr_stripes > max_stripes) {
786
		pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
787
			(unsigned int)d->nr_stripes);
788
		return -ENOMEM;
789
	}
790 791

	n = d->nr_stripes * sizeof(atomic_t);
792
	d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
793 794
	if (!d->stripe_sectors_dirty)
		return -ENOMEM;
K
Kent Overstreet 已提交
795

796
	n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
797
	d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL);
798 799 800
	if (!d->full_dirty_stripes)
		return -ENOMEM;

801 802 803 804
	idx = ida_simple_get(&bcache_device_idx, 0,
				BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
	if (idx < 0)
		return idx;
805

806
	if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
807 808 809 810 811 812
			BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
		goto err;

	d->disk = alloc_disk(BCACHE_MINORS);
	if (!d->disk)
		goto err;
K
Kent Overstreet 已提交
813

814
	set_capacity(d->disk, sectors);
815
	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
K
Kent Overstreet 已提交
816 817

	d->disk->major		= bcache_major;
818
	d->disk->first_minor	= idx_to_first_minor(idx);
K
Kent Overstreet 已提交
819 820 821
	d->disk->fops		= &bcache_ops;
	d->disk->private_data	= d;

822 823 824 825
	q = blk_alloc_queue(GFP_KERNEL);
	if (!q)
		return -ENOMEM;

K
Kent Overstreet 已提交
826 827 828
	blk_queue_make_request(q, NULL);
	d->disk->queue			= q;
	q->queuedata			= d;
829
	q->backing_dev_info->congested_data = d;
K
Kent Overstreet 已提交
830 831 832 833
	q->limits.max_hw_sectors	= UINT_MAX;
	q->limits.max_sectors		= UINT_MAX;
	q->limits.max_segment_size	= UINT_MAX;
	q->limits.max_segments		= BIO_MAX_PAGES;
834
	blk_queue_max_discard_sectors(q, UINT_MAX);
835
	q->limits.discard_granularity	= 512;
K
Kent Overstreet 已提交
836 837 838
	q->limits.io_min		= block_size;
	q->limits.logical_block_size	= block_size;
	q->limits.physical_block_size	= block_size;
839 840 841
	blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue);
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, d->disk->queue);
K
Kent Overstreet 已提交
842

843
	blk_queue_write_cache(q, true, true);
844

K
Kent Overstreet 已提交
845
	return 0;
846 847 848 849 850

err:
	ida_simple_remove(&bcache_device_idx, idx);
	return -ENOMEM;

K
Kent Overstreet 已提交
851 852 853 854 855 856 857 858 859 860 861 862 863 864 865
}

/* Cached device */

static void calc_cached_dev_sectors(struct cache_set *c)
{
	uint64_t sectors = 0;
	struct cached_dev *dc;

	list_for_each_entry(dc, &c->cached_devs, list)
		sectors += bdev_sectors(dc->bdev);

	c->cached_dev_sectors = sectors;
}

866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903
#define BACKING_DEV_OFFLINE_TIMEOUT 5
static int cached_dev_status_update(void *arg)
{
	struct cached_dev *dc = arg;
	struct request_queue *q;

	/*
	 * If this delayed worker is stopping outside, directly quit here.
	 * dc->io_disable might be set via sysfs interface, so check it
	 * here too.
	 */
	while (!kthread_should_stop() && !dc->io_disable) {
		q = bdev_get_queue(dc->bdev);
		if (blk_queue_dying(q))
			dc->offline_seconds++;
		else
			dc->offline_seconds = 0;

		if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
			pr_err("%s: device offline for %d seconds",
			       dc->backing_dev_name,
			       BACKING_DEV_OFFLINE_TIMEOUT);
			pr_err("%s: disable I/O request due to backing "
			       "device offline", dc->disk.name);
			dc->io_disable = true;
			/* let others know earlier that io_disable is true */
			smp_mb();
			bcache_device_stop(&dc->disk);
			break;
		}
		schedule_timeout_interruptible(HZ);
	}

	wait_for_kthread_stop();
	return 0;
}


K
Kent Overstreet 已提交
904 905 906
void bch_cached_dev_run(struct cached_dev *dc)
{
	struct bcache_device *d = &dc->disk;
G
Gabriel de Perthuis 已提交
907
	char buf[SB_LABEL_SIZE + 1];
908 909 910
	char *env[] = {
		"DRIVER=bcache",
		kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
G
Gabriel de Perthuis 已提交
911 912
		NULL,
		NULL,
913
	};
K
Kent Overstreet 已提交
914

G
Gabriel de Perthuis 已提交
915 916 917 918
	memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
	buf[SB_LABEL_SIZE] = '\0';
	env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);

919 920 921
	if (atomic_xchg(&dc->running, 1)) {
		kfree(env[1]);
		kfree(env[2]);
K
Kent Overstreet 已提交
922
		return;
923
	}
K
Kent Overstreet 已提交
924 925 926 927

	if (!d->c &&
	    BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
		struct closure cl;
928

K
Kent Overstreet 已提交
929 930 931 932 933 934 935 936
		closure_init_stack(&cl);

		SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	}

	add_disk(d->disk);
937
	bd_link_disk_holder(dc->bdev, dc->disk.disk);
C
Coly Li 已提交
938 939 940 941
	/*
	 * won't show up in the uevent file, use udevadm monitor -e instead
	 * only class / kset properties are persistent
	 */
K
Kent Overstreet 已提交
942
	kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
943
	kfree(env[1]);
G
Gabriel de Perthuis 已提交
944
	kfree(env[2]);
945

K
Kent Overstreet 已提交
946 947 948
	if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
	    sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
		pr_debug("error creating sysfs link");
949 950 951 952 953 954 955 956

	dc->status_update_thread = kthread_run(cached_dev_status_update,
					       dc, "bcache_status_update");
	if (IS_ERR(dc->status_update_thread)) {
		pr_warn("failed to create bcache_status_update kthread, "
			"continue to run without monitoring backing "
			"device status");
	}
K
Kent Overstreet 已提交
957 958
}

959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983
/*
 * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed
 * work dc->writeback_rate_update is running. Wait until the routine
 * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to
 * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out
 * seconds, give up waiting here and continue to cancel it too.
 */
static void cancel_writeback_rate_update_dwork(struct cached_dev *dc)
{
	int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ;

	do {
		if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING,
			      &dc->disk.flags))
			break;
		time_out--;
		schedule_timeout_interruptible(1);
	} while (time_out > 0);

	if (time_out == 0)
		pr_warn("give up waiting for dc->writeback_write_update to quit");

	cancel_delayed_work_sync(&dc->writeback_rate_update);
}

K
Kent Overstreet 已提交
984 985 986 987
static void cached_dev_detach_finish(struct work_struct *w)
{
	struct cached_dev *dc = container_of(w, struct cached_dev, detach);
	struct closure cl;
988

K
Kent Overstreet 已提交
989 990
	closure_init_stack(&cl);

991
	BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
992
	BUG_ON(refcount_read(&dc->count));
K
Kent Overstreet 已提交
993 994 995

	mutex_lock(&bch_register_lock);

996 997 998
	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
		cancel_writeback_rate_update_dwork(dc);

999 1000 1001 1002 1003
	if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
		kthread_stop(dc->writeback_thread);
		dc->writeback_thread = NULL;
	}

K
Kent Overstreet 已提交
1004 1005 1006 1007 1008 1009 1010 1011 1012
	memset(&dc->sb.set_uuid, 0, 16);
	SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);

	bch_write_bdev_super(dc, &cl);
	closure_sync(&cl);

	bcache_device_detach(&dc->disk);
	list_move(&dc->list, &uncached_devices);

1013
	clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
1014
	clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
1015

K
Kent Overstreet 已提交
1016 1017
	mutex_unlock(&bch_register_lock);

1018
	pr_info("Caching disabled for %s", dc->backing_dev_name);
K
Kent Overstreet 已提交
1019 1020 1021 1022 1023 1024 1025 1026 1027

	/* Drop ref we took in cached_dev_detach() */
	closure_put(&dc->disk.cl);
}

void bch_cached_dev_detach(struct cached_dev *dc)
{
	lockdep_assert_held(&bch_register_lock);

1028
	if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
K
Kent Overstreet 已提交
1029 1030
		return;

1031
	if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
K
Kent Overstreet 已提交
1032 1033 1034 1035 1036 1037 1038 1039 1040
		return;

	/*
	 * Block the device from being closed and freed until we're finished
	 * detaching
	 */
	closure_get(&dc->disk.cl);

	bch_writeback_queue(dc);
1041

K
Kent Overstreet 已提交
1042 1043 1044
	cached_dev_put(dc);
}

1045 1046
int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
			  uint8_t *set_uuid)
K
Kent Overstreet 已提交
1047
{
1048
	uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
K
Kent Overstreet 已提交
1049
	struct uuid_entry *u;
1050
	struct cached_dev *exist_dc, *t;
K
Kent Overstreet 已提交
1051

1052 1053
	if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) ||
	    (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)))
K
Kent Overstreet 已提交
1054 1055 1056
		return -ENOENT;

	if (dc->disk.c) {
1057 1058
		pr_err("Can't attach %s: already attached",
		       dc->backing_dev_name);
K
Kent Overstreet 已提交
1059 1060 1061 1062
		return -EINVAL;
	}

	if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
1063 1064
		pr_err("Can't attach %s: shutting down",
		       dc->backing_dev_name);
K
Kent Overstreet 已提交
1065 1066 1067 1068 1069
		return -EINVAL;
	}

	if (dc->sb.block_size < c->sb.block_size) {
		/* Will die */
K
Kent Overstreet 已提交
1070
		pr_err("Couldn't attach %s: block size less than set's block size",
1071
		       dc->backing_dev_name);
K
Kent Overstreet 已提交
1072 1073 1074
		return -EINVAL;
	}

1075 1076 1077 1078
	/* Check whether already attached */
	list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) {
		if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) {
			pr_err("Tried to attach %s but duplicate UUID already attached",
1079
				dc->backing_dev_name);
1080 1081 1082 1083 1084

			return -EINVAL;
		}
	}

K
Kent Overstreet 已提交
1085 1086 1087 1088 1089 1090
	u = uuid_find(c, dc->sb.uuid);

	if (u &&
	    (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
	     BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
		memcpy(u->uuid, invalid_uuid, 16);
1091
		u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
K
Kent Overstreet 已提交
1092 1093 1094 1095 1096
		u = NULL;
	}

	if (!u) {
		if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1097 1098
			pr_err("Couldn't find uuid for %s in set",
			       dc->backing_dev_name);
K
Kent Overstreet 已提交
1099 1100 1101 1102 1103
			return -ENOENT;
		}

		u = uuid_find_empty(c);
		if (!u) {
1104 1105
			pr_err("Not caching %s, no room for UUID",
			       dc->backing_dev_name);
K
Kent Overstreet 已提交
1106 1107 1108 1109
			return -EINVAL;
		}
	}

C
Coly Li 已提交
1110 1111 1112
	/*
	 * Deadlocks since we're called via sysfs...
	 * sysfs_remove_file(&dc->kobj, &sysfs_attach);
K
Kent Overstreet 已提交
1113 1114
	 */

1115
	if (bch_is_zero(u->uuid, 16)) {
K
Kent Overstreet 已提交
1116
		struct closure cl;
1117

K
Kent Overstreet 已提交
1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142
		closure_init_stack(&cl);

		memcpy(u->uuid, dc->sb.uuid, 16);
		memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
		u->first_reg = u->last_reg = rtime;
		bch_uuid_write(c);

		memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);

		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	} else {
		u->last_reg = rtime;
		bch_uuid_write(c);
	}

	bcache_device_attach(&dc->disk, c, u - c->uuids);
	list_move(&dc->list, &c->cached_devs);
	calc_cached_dev_sectors(c);

	/*
	 * dc->c must be set before dc->count != 0 - paired with the mb in
	 * cached_dev_get()
	 */
1143
	smp_wmb();
1144
	refcount_set(&dc->count, 1);
K
Kent Overstreet 已提交
1145

1146 1147 1148 1149
	/* Block writeback thread, but spawn it */
	down_write(&dc->writeback_lock);
	if (bch_cached_dev_writeback_start(dc)) {
		up_write(&dc->writeback_lock);
1150
		return -ENOMEM;
1151
	}
1152

K
Kent Overstreet 已提交
1153
	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1154
		bch_sectors_dirty_init(&dc->disk);
K
Kent Overstreet 已提交
1155 1156 1157 1158 1159
		atomic_set(&dc->has_dirty, 1);
		bch_writeback_queue(dc);
	}

	bch_cached_dev_run(dc);
1160
	bcache_device_link(&dc->disk, c, "bdev");
1161
	atomic_inc(&c->attached_dev_nr);
K
Kent Overstreet 已提交
1162

1163 1164 1165
	/* Allow the writeback thread to proceed */
	up_write(&dc->writeback_lock);

K
Kent Overstreet 已提交
1166
	pr_info("Caching %s as %s on set %pU",
1167 1168
		dc->backing_dev_name,
		dc->disk.disk->disk_name,
K
Kent Overstreet 已提交
1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184
		dc->disk.c->sb.set_uuid);
	return 0;
}

void bch_cached_dev_release(struct kobject *kobj)
{
	struct cached_dev *dc = container_of(kobj, struct cached_dev,
					     disk.kobj);
	kfree(dc);
	module_put(THIS_MODULE);
}

static void cached_dev_free(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);

1185 1186 1187 1188 1189
	mutex_lock(&bch_register_lock);

	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
		cancel_writeback_rate_update_dwork(dc);

1190 1191
	if (!IS_ERR_OR_NULL(dc->writeback_thread))
		kthread_stop(dc->writeback_thread);
1192 1193
	if (dc->writeback_write_wq)
		destroy_workqueue(dc->writeback_write_wq);
1194 1195
	if (!IS_ERR_OR_NULL(dc->status_update_thread))
		kthread_stop(dc->status_update_thread);
K
Kent Overstreet 已提交
1196

1197 1198
	if (atomic_read(&dc->running))
		bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
K
Kent Overstreet 已提交
1199 1200 1201 1202 1203
	bcache_device_free(&dc->disk);
	list_del(&dc->list);

	mutex_unlock(&bch_register_lock);

1204
	if (!IS_ERR_OR_NULL(dc->bdev))
K
Kent Overstreet 已提交
1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216
		blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);

	wake_up(&unregister_wait);

	kobject_put(&dc->disk.kobj);
}

static void cached_dev_flush(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
	struct bcache_device *d = &dc->disk;

1217
	mutex_lock(&bch_register_lock);
1218
	bcache_device_unlink(d);
1219 1220
	mutex_unlock(&bch_register_lock);

K
Kent Overstreet 已提交
1221 1222 1223 1224 1225 1226
	bch_cache_accounting_destroy(&dc->accounting);
	kobject_del(&d->kobj);

	continue_at(cl, cached_dev_free, system_wq);
}

1227
static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
K
Kent Overstreet 已提交
1228
{
1229
	int ret;
K
Kent Overstreet 已提交
1230
	struct io *io;
1231
	struct request_queue *q = bdev_get_queue(dc->bdev);
K
Kent Overstreet 已提交
1232 1233 1234

	__module_get(THIS_MODULE);
	INIT_LIST_HEAD(&dc->list);
1235 1236
	closure_init(&dc->disk.cl, NULL);
	set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
K
Kent Overstreet 已提交
1237 1238
	kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
	INIT_WORK(&dc->detach, cached_dev_detach_finish);
1239
	sema_init(&dc->sb_write_mutex, 1);
1240 1241 1242
	INIT_LIST_HEAD(&dc->io_lru);
	spin_lock_init(&dc->io_lock);
	bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
K
Kent Overstreet 已提交
1243 1244 1245 1246 1247 1248 1249 1250

	dc->sequential_cutoff		= 4 << 20;

	for (io = dc->io; io < dc->io + RECENT_IO; io++) {
		list_add(&io->lru, &dc->io_lru);
		hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
	}

1251 1252 1253 1254 1255 1256
	dc->disk.stripe_size = q->limits.io_opt >> 9;

	if (dc->disk.stripe_size)
		dc->partial_stripes_expensive =
			q->limits.raid_partial_stripes_expensive;

1257 1258
	ret = bcache_device_init(&dc->disk, block_size,
			 dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
1259 1260 1261
	if (ret)
		return ret;

1262 1263 1264
	dc->disk.disk->queue->backing_dev_info->ra_pages =
		max(dc->disk.disk->queue->backing_dev_info->ra_pages,
		    q->backing_dev_info->ra_pages);
1265

1266 1267 1268
	atomic_set(&dc->io_errors, 0);
	dc->io_disable = false;
	dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;
1269 1270 1271
	/* default to auto */
	dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO;

1272 1273
	bch_cached_dev_request_init(dc);
	bch_cached_dev_writeback_init(dc);
K
Kent Overstreet 已提交
1274 1275 1276 1277 1278
	return 0;
}

/* Cached device - bcache superblock */

1279
static void register_bdev(struct cache_sb *sb, struct page *sb_page,
K
Kent Overstreet 已提交
1280 1281 1282 1283 1284 1285
				 struct block_device *bdev,
				 struct cached_dev *dc)
{
	const char *err = "cannot allocate memory";
	struct cache_set *c;

1286
	bdevname(bdev, dc->backing_dev_name);
K
Kent Overstreet 已提交
1287 1288 1289 1290
	memcpy(&dc->sb, sb, sizeof(struct cache_sb));
	dc->bdev = bdev;
	dc->bdev->bd_holder = dc;

1291
	bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
1292
	bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page;
1293
	get_page(sb_page);
1294

1295

1296 1297
	if (cached_dev_init(dc, sb->block_size << 9))
		goto err;
K
Kent Overstreet 已提交
1298 1299 1300 1301 1302 1303 1304 1305

	err = "error creating kobject";
	if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
			"bcache"))
		goto err;
	if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
		goto err;

1306
	pr_info("registered backing device %s", dc->backing_dev_name);
1307

K
Kent Overstreet 已提交
1308
	list_add(&dc->list, &uncached_devices);
C
Coly Li 已提交
1309
	/* attach to a matched cache set if it exists */
K
Kent Overstreet 已提交
1310
	list_for_each_entry(c, &bch_cache_sets, list)
1311
		bch_cached_dev_attach(dc, c, NULL);
K
Kent Overstreet 已提交
1312 1313 1314 1315 1316

	if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
	    BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
		bch_cached_dev_run(dc);

1317
	return;
K
Kent Overstreet 已提交
1318
err:
1319
	pr_notice("error %s: %s", dc->backing_dev_name, err);
1320
	bcache_device_stop(&dc->disk);
K
Kent Overstreet 已提交
1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334
}

/* Flash only volumes */

void bch_flash_dev_release(struct kobject *kobj)
{
	struct bcache_device *d = container_of(kobj, struct bcache_device,
					       kobj);
	kfree(d);
}

static void flash_dev_free(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1335

1336
	mutex_lock(&bch_register_lock);
1337 1338
	atomic_long_sub(bcache_dev_sectors_dirty(d),
			&d->c->flash_dev_dirty_sectors);
K
Kent Overstreet 已提交
1339
	bcache_device_free(d);
1340
	mutex_unlock(&bch_register_lock);
K
Kent Overstreet 已提交
1341 1342 1343 1344 1345 1346 1347
	kobject_put(&d->kobj);
}

static void flash_dev_flush(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);

1348
	mutex_lock(&bch_register_lock);
1349
	bcache_device_unlink(d);
1350
	mutex_unlock(&bch_register_lock);
K
Kent Overstreet 已提交
1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366
	kobject_del(&d->kobj);
	continue_at(cl, flash_dev_free, system_wq);
}

static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
{
	struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
					  GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	closure_init(&d->cl, NULL);
	set_closure_fn(&d->cl, flash_dev_flush, system_wq);

	kobject_init(&d->kobj, &bch_flash_dev_ktype);

1367
	if (bcache_device_init(d, block_bytes(c), u->sectors))
K
Kent Overstreet 已提交
1368 1369 1370
		goto err;

	bcache_device_attach(d, c, u - c->uuids);
1371
	bch_sectors_dirty_init(d);
K
Kent Overstreet 已提交
1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391
	bch_flash_dev_request_init(d);
	add_disk(d->disk);

	if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
		goto err;

	bcache_device_link(d, c, "volume");

	return 0;
err:
	kobject_put(&d->kobj);
	return -ENOMEM;
}

static int flash_devs_run(struct cache_set *c)
{
	int ret = 0;
	struct uuid_entry *u;

	for (u = c->uuids;
1392
	     u < c->uuids + c->nr_uuids && !ret;
K
Kent Overstreet 已提交
1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406
	     u++)
		if (UUID_FLASH_ONLY(u))
			ret = flash_dev_run(c, u);

	return ret;
}

int bch_flash_dev_create(struct cache_set *c, uint64_t size)
{
	struct uuid_entry *u;

	if (test_bit(CACHE_SET_STOPPING, &c->flags))
		return -EINTR;

1407 1408 1409
	if (!test_bit(CACHE_SET_RUNNING, &c->flags))
		return -EPERM;

K
Kent Overstreet 已提交
1410 1411 1412 1413 1414 1415 1416 1417
	u = uuid_find_empty(c);
	if (!u) {
		pr_err("Can't create volume, no room for UUID");
		return -EINVAL;
	}

	get_random_bytes(u->uuid, 16);
	memset(u->label, 0, 32);
1418
	u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds());
K
Kent Overstreet 已提交
1419 1420 1421 1422 1423 1424 1425 1426 1427

	SET_UUID_FLASH_ONLY(u, 1);
	u->sectors = size >> 9;

	bch_uuid_write(c);

	return flash_dev_run(c, u);
}

1428 1429
bool bch_cached_dev_error(struct cached_dev *dc)
{
1430 1431
	struct cache_set *c;

1432 1433 1434 1435 1436 1437 1438 1439
	if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
		return false;

	dc->io_disable = true;
	/* make others know io_disable is true earlier */
	smp_mb();

	pr_err("stop %s: too many IO errors on backing device %s\n",
1440
		dc->disk.disk->disk_name, dc->backing_dev_name);
1441

1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456
	/*
	 * If the cached device is still attached to a cache set,
	 * even dc->io_disable is true and no more I/O requests
	 * accepted, cache device internal I/O (writeback scan or
	 * garbage collection) may still prevent bcache device from
	 * being stopped. So here CACHE_SET_IO_DISABLE should be
	 * set to c->flags too, to make the internal I/O to cache
	 * device rejected and stopped immediately.
	 * If c is NULL, that means the bcache device is not attached
	 * to any cache set, then no CACHE_SET_IO_DISABLE bit to set.
	 */
	c = dc->disk.c;
	if (c && test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
		pr_info("CACHE_SET_IO_DISABLE already set");

1457 1458 1459 1460
	bcache_device_stop(&dc->disk);
	return true;
}

K
Kent Overstreet 已提交
1461 1462 1463 1464 1465 1466 1467
/* Cache set */

__printf(2, 3)
bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
{
	va_list args;

1468 1469
	if (c->on_error != ON_ERROR_PANIC &&
	    test_bit(CACHE_SET_STOPPING, &c->flags))
K
Kent Overstreet 已提交
1470 1471
		return false;

1472
	if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
1473
		pr_info("CACHE_SET_IO_DISABLE already set");
1474

C
Coly Li 已提交
1475 1476 1477 1478
	/*
	 * XXX: we can be called from atomic context
	 * acquire_console_sem();
	 */
K
Kent Overstreet 已提交
1479

1480
	pr_err("bcache: error on %pU: ", c->sb.set_uuid);
K
Kent Overstreet 已提交
1481 1482 1483 1484 1485

	va_start(args, fmt);
	vprintk(fmt, args);
	va_end(args);

1486
	pr_err(", disabling caching\n");
K
Kent Overstreet 已提交
1487

1488 1489 1490
	if (c->on_error == ON_ERROR_PANIC)
		panic("panic forced after error\n");

K
Kent Overstreet 已提交
1491 1492 1493 1494 1495 1496 1497
	bch_cache_set_unregister(c);
	return true;
}

void bch_cache_set_release(struct kobject *kobj)
{
	struct cache_set *c = container_of(kobj, struct cache_set, kobj);
1498

K
Kent Overstreet 已提交
1499 1500 1501 1502 1503 1504 1505 1506
	kfree(c);
	module_put(THIS_MODULE);
}

static void cache_set_free(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, cl);
	struct cache *ca;
1507
	unsigned int i;
K
Kent Overstreet 已提交
1508 1509 1510 1511 1512 1513 1514 1515 1516

	if (!IS_ERR_OR_NULL(c->debug))
		debugfs_remove(c->debug);

	bch_open_buckets_free(c);
	bch_btree_cache_free(c);
	bch_journal_free(c);

	for_each_cache(ca, c, i)
1517 1518 1519
		if (ca) {
			ca->set = NULL;
			c->cache[ca->sb.nr_this_dev] = NULL;
K
Kent Overstreet 已提交
1520
			kobject_put(&ca->kobj);
1521
		}
K
Kent Overstreet 已提交
1522

1523
	bch_bset_sort_state_free(&c->sort);
K
Kent Overstreet 已提交
1524 1525
	free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));

1526 1527
	if (c->moving_gc_wq)
		destroy_workqueue(c->moving_gc_wq);
1528 1529 1530 1531
	bioset_exit(&c->bio_split);
	mempool_exit(&c->fill_iter);
	mempool_exit(&c->bio_meta);
	mempool_exit(&c->search);
K
Kent Overstreet 已提交
1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547
	kfree(c->devices);

	mutex_lock(&bch_register_lock);
	list_del(&c->list);
	mutex_unlock(&bch_register_lock);

	pr_info("Cache set %pU unregistered", c->sb.set_uuid);
	wake_up(&unregister_wait);

	closure_debug_destroy(&c->cl);
	kobject_put(&c->kobj);
}

static void cache_set_flush(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
1548
	struct cache *ca;
K
Kent Overstreet 已提交
1549
	struct btree *b;
1550
	unsigned int i;
K
Kent Overstreet 已提交
1551 1552 1553 1554 1555 1556

	bch_cache_accounting_destroy(&c->accounting);

	kobject_put(&c->internal);
	kobject_del(&c->kobj);

K
Kent Overstreet 已提交
1557 1558 1559
	if (c->gc_thread)
		kthread_stop(c->gc_thread);

K
Kent Overstreet 已提交
1560 1561 1562 1563
	if (!IS_ERR_OR_NULL(c->root))
		list_add(&c->root->list, &c->btree_cache);

	/* Should skip this if we're unregistering because of an error */
K
Kent Overstreet 已提交
1564 1565
	list_for_each_entry(b, &c->btree_cache, list) {
		mutex_lock(&b->write_lock);
K
Kent Overstreet 已提交
1566
		if (btree_node_dirty(b))
K
Kent Overstreet 已提交
1567 1568 1569
			__bch_btree_node_write(b, NULL);
		mutex_unlock(&b->write_lock);
	}
K
Kent Overstreet 已提交
1570

1571 1572 1573 1574
	for_each_cache(ca, c, i)
		if (ca->alloc_thread)
			kthread_stop(ca->alloc_thread);

1575 1576 1577 1578 1579
	if (c->journal.cur) {
		cancel_delayed_work_sync(&c->journal.work);
		/* flush last journal entry if needed */
		c->journal.work.work.func(&c->journal.work.work);
	}
K
Kent Overstreet 已提交
1580

K
Kent Overstreet 已提交
1581 1582 1583
	closure_return(cl);
}

1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614
/*
 * This function is only called when CACHE_SET_IO_DISABLE is set, which means
 * cache set is unregistering due to too many I/O errors. In this condition,
 * the bcache device might be stopped, it depends on stop_when_cache_set_failed
 * value and whether the broken cache has dirty data:
 *
 * dc->stop_when_cache_set_failed    dc->has_dirty   stop bcache device
 *  BCH_CACHED_STOP_AUTO               0               NO
 *  BCH_CACHED_STOP_AUTO               1               YES
 *  BCH_CACHED_DEV_STOP_ALWAYS         0               YES
 *  BCH_CACHED_DEV_STOP_ALWAYS         1               YES
 *
 * The expected behavior is, if stop_when_cache_set_failed is configured to
 * "auto" via sysfs interface, the bcache device will not be stopped if the
 * backing device is clean on the broken cache device.
 */
static void conditional_stop_bcache_device(struct cache_set *c,
					   struct bcache_device *d,
					   struct cached_dev *dc)
{
	if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
		pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.",
			d->disk->disk_name, c->sb.set_uuid);
		bcache_device_stop(d);
	} else if (atomic_read(&dc->has_dirty)) {
		/*
		 * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
		 * and dc->has_dirty == 1
		 */
		pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.",
			d->disk->disk_name);
1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628
			/*
			 * There might be a small time gap that cache set is
			 * released but bcache device is not. Inside this time
			 * gap, regular I/O requests will directly go into
			 * backing device as no cache set attached to. This
			 * behavior may also introduce potential inconsistence
			 * data in writeback mode while cache is dirty.
			 * Therefore before calling bcache_device_stop() due
			 * to a broken cache device, dc->io_disable should be
			 * explicitly set to true.
			 */
			dc->io_disable = true;
			/* make others know io_disable is true earlier */
			smp_mb();
1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639
			bcache_device_stop(d);
	} else {
		/*
		 * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
		 * and dc->has_dirty == 0
		 */
		pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.",
			d->disk->disk_name);
	}
}

K
Kent Overstreet 已提交
1640 1641 1642
static void __cache_set_unregister(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
K
Kent Overstreet 已提交
1643
	struct cached_dev *dc;
1644
	struct bcache_device *d;
K
Kent Overstreet 已提交
1645 1646 1647 1648
	size_t i;

	mutex_lock(&bch_register_lock);

1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661
	for (i = 0; i < c->devices_max_used; i++) {
		d = c->devices[i];
		if (!d)
			continue;

		if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
		    test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
			dc = container_of(d, struct cached_dev, disk);
			bch_cached_dev_detach(dc);
			if (test_bit(CACHE_SET_IO_DISABLE, &c->flags))
				conditional_stop_bcache_device(c, d, dc);
		} else {
			bcache_device_stop(d);
K
Kent Overstreet 已提交
1662
		}
1663
	}
K
Kent Overstreet 已提交
1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688

	mutex_unlock(&bch_register_lock);

	continue_at(cl, cache_set_flush, system_wq);
}

void bch_cache_set_stop(struct cache_set *c)
{
	if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
		closure_queue(&c->caching);
}

void bch_cache_set_unregister(struct cache_set *c)
{
	set_bit(CACHE_SET_UNREGISTERING, &c->flags);
	bch_cache_set_stop(c);
}

#define alloc_bucket_pages(gfp, c)			\
	((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))

struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
{
	int iter_size;
	struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
1689

K
Kent Overstreet 已提交
1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716
	if (!c)
		return NULL;

	__module_get(THIS_MODULE);
	closure_init(&c->cl, NULL);
	set_closure_fn(&c->cl, cache_set_free, system_wq);

	closure_init(&c->caching, &c->cl);
	set_closure_fn(&c->caching, __cache_set_unregister, system_wq);

	/* Maybe create continue_at_noreturn() and use it here? */
	closure_set_stopped(&c->cl);
	closure_put(&c->cl);

	kobject_init(&c->kobj, &bch_cache_set_ktype);
	kobject_init(&c->internal, &bch_cache_set_internal_ktype);

	bch_cache_accounting_init(&c->accounting, &c->cl);

	memcpy(c->sb.set_uuid, sb->set_uuid, 16);
	c->sb.block_size	= sb->block_size;
	c->sb.bucket_size	= sb->bucket_size;
	c->sb.nr_in_set		= sb->nr_in_set;
	c->sb.last_mount	= sb->last_mount;
	c->bucket_bits		= ilog2(sb->bucket_size);
	c->block_bits		= ilog2(sb->block_size);
	c->nr_uuids		= bucket_bytes(c) / sizeof(struct uuid_entry);
1717
	c->devices_max_used	= 0;
1718
	atomic_set(&c->attached_dev_nr, 0);
1719
	c->btree_pages		= bucket_pages(c);
K
Kent Overstreet 已提交
1720 1721 1722 1723
	if (c->btree_pages > BTREE_MAX_PAGES)
		c->btree_pages = max_t(int, c->btree_pages / 4,
				       BTREE_MAX_PAGES);

1724
	sema_init(&c->sb_write_mutex, 1);
1725
	mutex_init(&c->bucket_lock);
1726
	init_waitqueue_head(&c->btree_cache_wait);
1727
	init_waitqueue_head(&c->bucket_wait);
1728
	init_waitqueue_head(&c->gc_wait);
1729
	sema_init(&c->uuid_write_mutex, 1);
1730 1731 1732 1733

	spin_lock_init(&c->btree_gc_time.lock);
	spin_lock_init(&c->btree_split_time.lock);
	spin_lock_init(&c->btree_read_time.lock);
1734

K
Kent Overstreet 已提交
1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746
	bch_moving_init_cache_set(c);

	INIT_LIST_HEAD(&c->list);
	INIT_LIST_HEAD(&c->cached_devs);
	INIT_LIST_HEAD(&c->btree_cache);
	INIT_LIST_HEAD(&c->btree_cache_freeable);
	INIT_LIST_HEAD(&c->btree_cache_freed);
	INIT_LIST_HEAD(&c->data_buckets);

	iter_size = (sb->bucket_size / sb->block_size + 1) *
		sizeof(struct btree_iter_set);

	if (!(c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL)) ||
	    mempool_init_slab_pool(&c->search, 32, bch_search_cache) ||
	    mempool_init_kmalloc_pool(&c->bio_meta, 2,
				sizeof(struct bbio) + sizeof(struct bio_vec) *
				bucket_pages(c)) ||
	    mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
	    bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
			BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
	    !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
	    !(c->moving_gc_wq = alloc_workqueue("bcache_gc",
						WQ_MEM_RECLAIM, 0)) ||
	    bch_journal_alloc(c) ||
	    bch_btree_cache_alloc(c) ||
	    bch_open_buckets_alloc(c) ||
	    bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
		goto err;

	c->congested_read_threshold_us	= 2000;
	c->congested_write_threshold_us	= 20000;
	c->error_limit	= DEFAULT_IO_ERROR_LIMIT;
	WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));

	return c;
err:
	bch_cache_set_unregister(c);
	return NULL;
}

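/*
 * Bring a fully assembled cache set online: recover or initialize the
 * btree and journal, start the allocator and gc threads, and attach any
 * backing devices that registered before their cache showed up.
 */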
static void run_cache_set(struct cache_set *c)
{
	const char *err = "cannot allocate memory";
	struct cached_dev *dc, *t;
	struct cache *ca;
	struct closure cl;
	unsigned int i;

	closure_init_stack(&cl);

	for_each_cache(ca, c, i)
		c->nbuckets += ca->sb.nbuckets;
	set_gc_sectors(c);

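	/*
	 * CACHE_SYNC means the set shut down cleanly, so its metadata can
	 * be trusted: read the priorities and replay the journal. If it is
	 * not set, the cache is treated as brand new and any existing data
	 * on it is invalidated.
	 */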
	if (CACHE_SYNC(&c->sb)) {
		LIST_HEAD(journal);
		struct bkey *k;
		struct jset *j;

		err = "cannot allocate memory for journal";
		if (bch_journal_read(c, &journal))
			goto err;

		pr_debug("btree_journal_read() done");

		err = "no journal entries found";
		if (list_empty(&journal))
			goto err;

		j = &list_entry(journal.prev, struct journal_replay, list)->j;

		err = "IO error reading priorities";
		for_each_cache(ca, c, i)
			prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);

		/*
		 * If prio_read() fails it'll call cache_set_error and we'll
		 * tear everything down right away, but if we checked for
		 * errors sooner we could avoid the journal replay.
		 */

		k = &j->btree_root;

		err = "bad btree root";
		if (__bch_btree_ptr_invalid(c, k))
			goto err;

		err = "error reading btree root";
		c->root = bch_btree_node_get(c, NULL, k,
					     j->btree_level,
					     true, NULL);
		if (IS_ERR_OR_NULL(c->root))
			goto err;

		list_del_init(&c->root->list);
		rw_unlock(true, c->root);

		err = uuid_read(c, j, &cl);
		if (err)
			goto err;

		err = "error in recovery";
		if (bch_btree_check(c))
			goto err;
			goto err;

		bch_journal_mark(c, &journal);
		bch_initial_gc_finish(c);
		pr_debug("btree_check() done");

		/*
		 * bcache_journal_next() can't happen sooner, or
		 * btree_gc_finish() will give spurious errors about last_gc >
		 * gc_gen - this is a hack but oh well.
		 */
		bch_journal_next(&c->journal);

		err = "error starting allocator thread";
		for_each_cache(ca, c, i)
			if (bch_cache_allocator_start(ca))
				goto err;

		/*
		 * First place it's safe to allocate: btree_check() and
		 * btree_gc_finish() have to run before we have buckets to
		 * allocate, and bch_bucket_alloc_set() might cause a journal
		 * entry to be written so bcache_journal_next() has to be called
		 * first.
		 *
		 * If the uuids were in the old format we have to rewrite them
		 * before the next journal entry is written:
		 */
		if (j->version < BCACHE_JSET_VERSION_UUID)
			__uuid_write(c);

		bch_journal_replay(c, &journal);
	} else {
		pr_notice("invalidating existing data");

		for_each_cache(ca, c, i) {
			unsigned int j;

			ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
					      2, SB_JOURNAL_BUCKETS);

			for (j = 0; j < ca->sb.keys; j++)
				ca->sb.d[j] = ca->sb.first_bucket + j;
		}

		bch_initial_gc_finish(c);

		err = "error starting allocator thread";
		for_each_cache(ca, c, i)
			if (bch_cache_allocator_start(ca))
				goto err;

		mutex_lock(&c->bucket_lock);
		for_each_cache(ca, c, i)
			bch_prio_write(ca);
		mutex_unlock(&c->bucket_lock);

		err = "cannot allocate new UUID bucket";
		if (__uuid_write(c))
			goto err;

		err = "cannot allocate new btree root";
		c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
		if (IS_ERR_OR_NULL(c->root))
			goto err;

		mutex_lock(&c->root->write_lock);
		bkey_copy_key(&c->root->key, &MAX_KEY);
		bch_btree_node_write(c->root, &cl);
		mutex_unlock(&c->root->write_lock);

		bch_btree_set_root(c->root);
		rw_unlock(true, c->root);

		/*
		 * We don't want to write the first journal entry until
		 * everything is set up - fortunately journal entries won't be
		 * written until the SET_CACHE_SYNC() here:
		 */
		SET_CACHE_SYNC(&c->sb, true);

		bch_journal_next(&c->journal);
		bch_journal_meta(c, &cl);
	}

	err = "error starting gc thread";
	if (bch_gc_thread_start(c))
		goto err;

	closure_sync(&cl);
	c->sb.last_mount = (u32)ktime_get_real_seconds();
	bcache_write_super(c);

	list_for_each_entry_safe(dc, t, &uncached_devices, list)
		bch_cached_dev_attach(dc, c, NULL);

	flash_devs_run(c);

	set_bit(CACHE_SET_RUNNING, &c->flags);
	return;
err:
	closure_sync(&cl);
	/* XXX: test this, it's broken */
	bch_cache_set_error(c, "%s", err);
}

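/*
 * A cache may only join a set whose superblock geometry (block size,
 * bucket size and member count) matches its own.
 */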
static bool can_attach_cache(struct cache *ca, struct cache_set *c)
{
	return ca->sb.block_size	== c->sb.block_size &&
		ca->sb.bucket_size	== c->sb.bucket_size &&
		ca->sb.nr_in_set	== c->sb.nr_in_set;
}

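/*
 * Add a cache to the set with a matching set UUID, allocating a new set
 * if none exists yet. Returns NULL on success or an error string; once
 * the last member arrives, run_cache_set() brings the whole set online.
 */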
static const char *register_cache_set(struct cache *ca)
{
	char buf[12];
	const char *err = "cannot allocate memory";
	struct cache_set *c;

	list_for_each_entry(c, &bch_cache_sets, list)
		if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
			if (c->cache[ca->sb.nr_this_dev])
				return "duplicate cache set member";

			if (!can_attach_cache(ca, c))
				return "cache sb does not match set";

			if (!CACHE_SYNC(&ca->sb))
				SET_CACHE_SYNC(&c->sb, false);

			goto found;
		}

	c = bch_cache_set_alloc(&ca->sb);
	if (!c)
		return err;

	err = "error creating kobject";
	if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
	    kobject_add(&c->internal, &c->kobj, "internal"))
		goto err;

	if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
		goto err;

	bch_debug_init_cache_set(c);

	list_add(&c->list, &bch_cache_sets);
found:
	sprintf(buf, "cache%i", ca->sb.nr_this_dev);
	if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
	    sysfs_create_link(&c->kobj, &ca->kobj, buf))
		goto err;

	if (ca->sb.seq > c->sb.seq) {
		c->sb.version		= ca->sb.version;
		memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
		c->sb.flags             = ca->sb.flags;
		c->sb.seq		= ca->sb.seq;
		pr_debug("set version = %llu", c->sb.version);
	}

	kobject_get(&ca->kobj);
	ca->set = c;
	ca->set->cache[ca->sb.nr_this_dev] = ca;
	c->cache_by_alloc[c->caches_loaded++] = ca;

	if (c->caches_loaded == c->sb.nr_in_set)
		run_cache_set(c);

	return NULL;
err:
	bch_cache_set_unregister(c);
	return err;
}

/* Cache device */

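/*
 * kobject release path for a cache device: undo what cache_alloc() and
 * register_cache() set up, then drop the module reference taken in
 * cache_alloc().
 */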
void bch_cache_release(struct kobject *kobj)
{
	struct cache *ca = container_of(kobj, struct cache, kobj);
	unsigned int i;

	if (ca->set) {
		BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
		ca->set->cache[ca->sb.nr_this_dev] = NULL;
	}

	free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
	kfree(ca->prio_buckets);
	vfree(ca->buckets);

	free_heap(&ca->heap);
	free_fifo(&ca->free_inc);

	for (i = 0; i < RESERVE_NR; i++)
		free_fifo(&ca->free[i]);

	if (ca->sb_bio.bi_inline_vecs[0].bv_page)
		put_page(bio_first_page_all(&ca->sb_bio));

	if (!IS_ERR_OR_NULL(ca->bdev))
		blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);

	kfree(ca);
	module_put(THIS_MODULE);
}

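/*
 * Allocate the in-memory structures for a cache device: the per-reserve
 * free-bucket fifos, the heap, the bucket array, and the prio/disk
 * bucket buffers. Returns -ENOMEM if any allocation fails.
 */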
static int cache_alloc(struct cache *ca)
{
	size_t free;
	size_t btree_buckets;
	struct bucket *b;

	__module_get(THIS_MODULE);
	kobject_init(&ca->kobj, &bch_cache_ktype);

	bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);

	/*
	 * When ca->sb.njournal_buckets is not zero a journal exists, and
	 * btree nodes may split during bch_journal_replay(), so buckets of
	 * type RESERVE_BTREE are needed. The worst case is that every
	 * journal bucket holds valid entries and every key must be
	 * replayed, so reserve as many RESERVE_BTREE buckets as there are
	 * journal buckets.
	 */
	btree_buckets = ca->sb.njournal_buckets ?: 8;
	free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;

	if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets, GFP_KERNEL) ||
	    !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
	    !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
	    !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
	    !init_fifo(&ca->free_inc,	free << 2, GFP_KERNEL) ||
	    !init_heap(&ca->heap,	free << 3, GFP_KERNEL) ||
	    !(ca->buckets	= vzalloc(array_size(sizeof(struct bucket),
						     ca->sb.nbuckets))) ||
	    !(ca->prio_buckets	= kzalloc(array3_size(sizeof(uint64_t),
						      prio_buckets(ca), 2),
					  GFP_KERNEL)) ||
	    !(ca->disk_buckets	= alloc_bucket_pages(GFP_KERNEL, ca)))
		return -ENOMEM;

	ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);

	for_each_bucket(b, ca)
		atomic_set(&b->pin, 0);

	return 0;
}

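/*
 * Bind a freshly read superblock to its block device, allocate the
 * in-memory structures, and try to place the cache in a cache set.
 */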
static int register_cache(struct cache_sb *sb, struct page *sb_page,
				struct block_device *bdev, struct cache *ca)
{
	const char *err = NULL; /* must be set for any error case */
	int ret = 0;

	bdevname(bdev, ca->cache_dev_name);
	memcpy(&ca->sb, sb, sizeof(struct cache_sb));
	ca->bdev = bdev;
	ca->bdev->bd_holder = ca;

	bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
	bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page;
	get_page(sb_page);

	if (blk_queue_discard(bdev_get_queue(bdev)))
		ca->discard = CACHE_DISCARD(&ca->sb);

	ret = cache_alloc(ca);
	if (ret != 0) {
		blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
		if (ret == -ENOMEM)
			err = "cache_alloc(): -ENOMEM";
		else
			err = "cache_alloc(): unknown error";
		goto err;
	}

	if (kobject_add(&ca->kobj,
			&part_to_dev(bdev->bd_part)->kobj,
			"bcache")) {
		err = "error calling kobject_add";
		ret = -ENOMEM;
		goto out;
	}

	mutex_lock(&bch_register_lock);
	err = register_cache_set(ca);
	mutex_unlock(&bch_register_lock);

	if (err) {
		ret = -ENODEV;
		goto out;
	}

	pr_info("registered cache device %s", ca->cache_dev_name);

out:
	kobject_put(&ca->kobj);

err:
	if (err)
		pr_notice("error %s: %s", ca->cache_dev_name, err);

	return ret;
}

/* Global interfaces/init */

static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
			       const char *buffer, size_t size);

kobj_attribute_write(register,		register_bcache);
kobj_attribute_write(register_quiet,	register_bcache);

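/* Is bdev already open as a bcache backing device? */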
static bool bch_is_open_backing(struct block_device *bdev)
{
	struct cache_set *c, *tc;
	struct cached_dev *dc, *t;

	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
		list_for_each_entry_safe(dc, t, &c->cached_devs, list)
			if (dc->bdev == bdev)
				return true;
	list_for_each_entry_safe(dc, t, &uncached_devices, list)
		if (dc->bdev == bdev)
			return true;
	return false;
}

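/* Is bdev already open as a bcache cache device? */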
static bool bch_is_open_cache(struct block_device *bdev)
{
	struct cache_set *c, *tc;
	struct cache *ca;
	unsigned int i;

	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
		for_each_cache(ca, c, i)
			if (ca->bdev == bdev)
				return true;
	return false;
}

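/* The device lists walked above are protected by bch_register_lock. */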
static bool bch_is_open(struct block_device *bdev)
{
	return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
}

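/*
 * Handle writes to /sys/fs/bcache/register{,_quiet}: open the device at
 * the given path, read its superblock, and register it as a backing
 * device or a cache depending on SB_IS_BDEV().
 */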
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
			       const char *buffer, size_t size)
{
	ssize_t ret = size;
	const char *err = "cannot allocate memory";
	char *path = NULL;
	struct cache_sb *sb = NULL;
	struct block_device *bdev = NULL;
	struct page *sb_page = NULL;

	if (!try_module_get(THIS_MODULE))
		return -EBUSY;

	path = kstrndup(buffer, size, GFP_KERNEL);
	if (!path)
		goto err;

	sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
	if (!sb)
		goto err;

	err = "failed to open device";
	bdev = blkdev_get_by_path(strim(path),
				  FMODE_READ|FMODE_WRITE|FMODE_EXCL,
				  sb);
	if (IS_ERR(bdev)) {
		if (bdev == ERR_PTR(-EBUSY)) {
			bdev = lookup_bdev(strim(path));
			mutex_lock(&bch_register_lock);
			if (!IS_ERR(bdev) && bch_is_open(bdev))
				err = "device already registered";
			else
				err = "device busy";
			mutex_unlock(&bch_register_lock);
			if (!IS_ERR(bdev))
				bdput(bdev);
			if (attr == &ksysfs_register_quiet)
				goto out;
		}
		goto err;
	}

	err = "failed to set blocksize";
	if (set_blocksize(bdev, 4096))
		goto err_close;

	err = read_super(sb, bdev, &sb_page);
	if (err)
		goto err_close;

	err = "failed to register device";
	if (SB_IS_BDEV(sb)) {
		struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);

		if (!dc)
			goto err_close;

		mutex_lock(&bch_register_lock);
		register_bdev(sb, sb_page, bdev, dc);
		mutex_unlock(&bch_register_lock);
	} else {
		struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);

		if (!ca)
			goto err_close;

		if (register_cache(sb, sb_page, bdev, ca) != 0)
			goto err;
	}
out:
	if (sb_page)
		put_page(sb_page);
	kfree(sb);
	kfree(path);
	module_put(THIS_MODULE);
	return ret;

err_close:
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
err:
	pr_info("error %s: %s", path, err);
	ret = -EINVAL;
	goto out;
}

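/*
 * Reboot notifier: stop every cache set and backing device, waiting up
 * to two seconds for them to finish closing before the shutdown
 * proceeds.
 */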
static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
{
	if (code == SYS_DOWN ||
	    code == SYS_HALT ||
	    code == SYS_POWER_OFF) {
		DEFINE_WAIT(wait);
		unsigned long start = jiffies;
		bool stopped = false;

		struct cache_set *c, *tc;
		struct cached_dev *dc, *tdc;

		mutex_lock(&bch_register_lock);

		if (list_empty(&bch_cache_sets) &&
		    list_empty(&uncached_devices))
			goto out;

		pr_info("Stopping all devices:");

		list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
			bch_cache_set_stop(c);

		list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
			bcache_device_stop(&dc->disk);

		/* What's a condition variable? */
		while (1) {
			long timeout = start + 2 * HZ - jiffies;

			stopped = list_empty(&bch_cache_sets) &&
				list_empty(&uncached_devices);

			if (timeout < 0 || stopped)
				break;

			prepare_to_wait(&unregister_wait, &wait,
					TASK_UNINTERRUPTIBLE);

			mutex_unlock(&bch_register_lock);
			schedule_timeout(timeout);
			mutex_lock(&bch_register_lock);
		}

		finish_wait(&unregister_wait, &wait);

		if (stopped)
			pr_info("All devices stopped");
		else
			pr_notice("Timeout waiting for devices to be closed");
out:
		mutex_unlock(&bch_register_lock);
	}

	return NOTIFY_DONE;
}

static struct notifier_block reboot = {
	.notifier_call	= bcache_reboot,
	.priority	= INT_MAX, /* before any real devices */
};

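/*
 * Tear down everything bcache_init() set up; this is also the error
 * path when initialization fails part way through.
 */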
static void bcache_exit(void)
{
	bch_debug_exit();
	bch_request_exit();
	if (bcache_kobj)
		kobject_put(bcache_kobj);
	if (bcache_wq)
		destroy_workqueue(bcache_wq);
	if (bcache_major)
		unregister_blkdev(bcache_major, "bcache");
	unregister_reboot_notifier(&reboot);
	mutex_destroy(&bch_register_lock);
}

static int __init bcache_init(void)
{
	static const struct attribute *files[] = {
		&ksysfs_register.attr,
		&ksysfs_register_quiet.attr,
		NULL
	};

	mutex_init(&bch_register_lock);
	init_waitqueue_head(&unregister_wait);
	register_reboot_notifier(&reboot);

	bcache_major = register_blkdev(0, "bcache");
	if (bcache_major < 0) {
		unregister_reboot_notifier(&reboot);
		mutex_destroy(&bch_register_lock);
		return bcache_major;
	}

	bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
	if (!bcache_wq)
		goto err;

	bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
	if (!bcache_kobj)
		goto err;

	if (bch_request_init() ||
	    sysfs_create_files(bcache_kobj, files))
		goto err;

	bch_debug_init(bcache_kobj);
	closure_debug_init();

	return 0;
err:
	bcache_exit();
	return -ENOMEM;
}

module_exit(bcache_exit);
module_init(bcache_init);