/*
 * background writeback - scan btree for dirty data and write it to the backing
 * device
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
12
#include "writeback.h"
K
Kent Overstreet 已提交
13

14 15
#include <linux/delay.h>
#include <linux/kthread.h>
K
Kent Overstreet 已提交
16 17
#include <trace/events/bcache.h>

K
Kent Overstreet 已提交
18 19 20 21 22 23 24 25 26 27 28 29 30 31
/* Rate limiting */

static void __update_writeback_rate(struct cached_dev *dc)
{
	struct cache_set *c = dc->disk.c;
	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
	uint64_t cache_dirty_target =
		div_u64(cache_sectors * dc->writeback_percent, 100);

	int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
				   c->cached_dev_sectors);

	/* PD controller */

32
	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
K
Kent Overstreet 已提交
33
	int64_t derivative = dirty - dc->disk.sectors_dirty_last;
34 35
	int64_t proportional = dirty - target;
	int64_t change;
K
Kent Overstreet 已提交
36 37 38

	dc->disk.sectors_dirty_last = dirty;

39
	/* Scale to sectors per second */
K
Kent Overstreet 已提交
40

41 42
	proportional *= dc->writeback_rate_update_seconds;
	proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);
K
Kent Overstreet 已提交
43

44
	derivative = div_s64(derivative, dc->writeback_rate_update_seconds);
K
Kent Overstreet 已提交
45

46 47 48 49 50 51
	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
			      (dc->writeback_rate_d_term /
			       dc->writeback_rate_update_seconds) ?: 1, 0);

	derivative *= dc->writeback_rate_d_term;
	derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);
K
Kent Overstreet 已提交
52

53
	change = proportional + derivative;
K
Kent Overstreet 已提交
54 55 56 57

	/* Don't increase writeback rate if the device isn't keeping up */
	if (change > 0 &&
	    time_after64(local_clock(),
58
			 dc->writeback_rate.next + NSEC_PER_MSEC))
K
Kent Overstreet 已提交
59 60 61
		change = 0;

	dc->writeback_rate.rate =
62
		clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
K
Kent Overstreet 已提交
63
			1, NSEC_PER_MSEC);
64 65

	dc->writeback_rate_proportional = proportional;
K
Kent Overstreet 已提交
66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
	dc->writeback_rate_derivative = derivative;
	dc->writeback_rate_change = change;
	dc->writeback_rate_target = target;
}

static void update_writeback_rate(struct work_struct *work)
{
	struct cached_dev *dc = container_of(to_delayed_work(work),
					     struct cached_dev,
					     writeback_rate_update);

	down_read(&dc->writeback_lock);

	if (atomic_read(&dc->has_dirty) &&
	    dc->writeback_percent)
		__update_writeback_rate(dc);

	up_read(&dc->writeback_lock);
84 85 86

	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
K
Kent Overstreet 已提交
87 88 89 90
}

static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
{
91
	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
K
Kent Overstreet 已提交
92 93 94
	    !dc->writeback_percent)
		return 0;

95
	return bch_next_delay(&dc->writeback_rate, sectors);
K
Kent Overstreet 已提交
96 97
}

98 99 100 101 102
struct dirty_io {
	struct closure		cl;
	struct cached_dev	*dc;
	struct bio		bio;
};
K
Kent Overstreet 已提交
103

K
Kent Overstreet 已提交
104 105 106 107 108 109 110 111 112
static void dirty_init(struct keybuf_key *w)
{
	struct dirty_io *io = w->private;
	struct bio *bio = &io->bio;

	bio_init(bio);
	if (!io->dc->writeback_percent)
		bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

113
	bio->bi_iter.bi_size	= KEY_SIZE(&w->key) << 9;
K
Kent Overstreet 已提交
114 115 116
	bio->bi_max_vecs	= DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
	bio->bi_private		= w;
	bio->bi_io_vec		= bio->bi_inline_vecs;
117
	bch_bio_map(bio, NULL);
K
Kent Overstreet 已提交
118 119 120 121 122 123 124 125 126 127 128 129 130
}

/* Closure destructor: free the dirty_io allocated in read_dirty() */
static void dirty_io_destructor(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	kfree(io);
}

static void write_dirty_finish(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;
	struct cached_dev *dc = io->dc;
131 132
	struct bio_vec *bv;
	int i;
K
Kent Overstreet 已提交
133

134
	bio_for_each_segment_all(bv, &io->bio, i)
K
Kent Overstreet 已提交
135 136 137 138
		__free_page(bv->bv_page);

	/* This is kind of a dumb way of signalling errors. */
	if (KEY_DIRTY(&w->key)) {
139
		int ret;
K
Kent Overstreet 已提交
140
		unsigned i;
141 142 143
		struct keylist keys;

		bch_keylist_init(&keys);
K
Kent Overstreet 已提交
144

K
Kent Overstreet 已提交
145 146 147
		bkey_copy(keys.top, &w->key);
		SET_KEY_DIRTY(keys.top, false);
		bch_keylist_push(&keys);
K
Kent Overstreet 已提交
148 149 150 151

		for (i = 0; i < KEY_PTRS(&w->key); i++)
			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);

152
		ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);
K
Kent Overstreet 已提交
153

154
		if (ret)
K
Kent Overstreet 已提交
155 156
			trace_bcache_writeback_collision(&w->key);

157
		atomic_long_inc(ret
K
Kent Overstreet 已提交
158 159 160 161 162
				? &dc->disk.c->writeback_keys_failed
				: &dc->disk.c->writeback_keys_done);
	}

	bch_keybuf_del(&dc->writeback_keys, w);
163
	up(&dc->in_flight);
K
Kent Overstreet 已提交
164 165 166 167

	closure_return_with_destructor(cl, dirty_io_destructor);
}

168
static void dirty_endio(struct bio *bio)
K
Kent Overstreet 已提交
169 170 171 172
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

173
	if (bio->bi_error)
K
Kent Overstreet 已提交
174 175 176 177 178 179 180 181 182 183 184
		SET_KEY_DIRTY(&w->key, false);

	closure_put(&io->cl);
}

static void write_dirty(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;

	dirty_init(w);
M
Mike Christie 已提交
185
	bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
186
	io->bio.bi_iter.bi_sector = KEY_START(&w->key);
K
Kent Overstreet 已提交
187 188 189
	io->bio.bi_bdev		= io->dc->bdev;
	io->bio.bi_end_io	= dirty_endio;

190
	closure_bio_submit(&io->bio, cl);
K
Kent Overstreet 已提交
191

192
	continue_at(cl, write_dirty_finish, system_wq);
K
Kent Overstreet 已提交
193 194
}

195
static void read_dirty_endio(struct bio *bio)
K
Kent Overstreet 已提交
196 197 198 199 200
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
201
			    bio->bi_error, "reading dirty data from cache");
K
Kent Overstreet 已提交
202

203
	dirty_endio(bio);
K
Kent Overstreet 已提交
204 205 206 207 208 209
}

static void read_dirty_submit(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);

210
	closure_bio_submit(&io->bio, cl);
K
Kent Overstreet 已提交
211

212
	continue_at(cl, write_dirty, system_wq);
K
Kent Overstreet 已提交
213 214
}

215
static void read_dirty(struct cached_dev *dc)
K
Kent Overstreet 已提交
216
{
217
	unsigned delay = 0;
K
Kent Overstreet 已提交
218 219
	struct keybuf_key *w;
	struct dirty_io *io;
220 221 222
	struct closure cl;

	closure_init_stack(&cl);
K
Kent Overstreet 已提交
223 224 225 226 227 228

	/*
	 * XXX: if we error, background writeback just spins. Should use some
	 * mempools.
	 */

229 230
	while (!kthread_should_stop()) {

K
Kent Overstreet 已提交
231 232 233 234 235 236
		w = bch_keybuf_next(&dc->writeback_keys);
		if (!w)
			break;

		BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));

237 238 239
		if (KEY_START(&w->key) != dc->last_read ||
		    jiffies_to_msecs(delay) > 50)
			while (!kthread_should_stop() && delay)
240
				delay = schedule_timeout_interruptible(delay);
K
Kent Overstreet 已提交
241 242 243 244 245 246 247 248 249 250 251 252 253

		dc->last_read	= KEY_OFFSET(&w->key);

		io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
			     GFP_KERNEL);
		if (!io)
			goto err;

		w->private	= io;
		io->dc		= dc;

		dirty_init(w);
M
Mike Christie 已提交
254
		bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
255
		io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
K
Kent Overstreet 已提交
256 257 258 259
		io->bio.bi_bdev		= PTR_CACHE(dc->disk.c,
						    &w->key, 0)->bdev;
		io->bio.bi_end_io	= read_dirty_endio;

260
		if (bio_alloc_pages(&io->bio, GFP_KERNEL))
K
Kent Overstreet 已提交
261 262
			goto err_free;

K
Kent Overstreet 已提交
263
		trace_bcache_writeback(&w->key);
K
Kent Overstreet 已提交
264

265
		down(&dc->in_flight);
266
		closure_call(&io->cl, read_dirty_submit, NULL, &cl);
K
Kent Overstreet 已提交
267 268 269 270 271 272 273 274 275 276 277

		delay = writeback_delay(dc, KEY_SIZE(&w->key));
	}

	if (0) {
err_free:
		kfree(w->private);
err:
		bch_keybuf_del(&dc->writeback_keys, w);
	}

278 279 280 281
	/*
	 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
	 * freed) before refilling again
	 */
282 283 284 285 286 287 288 289 290
	closure_sync(&cl);
}

/* Scan for dirty data */

/*
 * Adjust the per-stripe dirty-sector counters for device @inode by
 * @nr_sectors (negative means sectors became clean), starting at
 * @offset, and keep the full_dirty_stripes bitmap in sync.
 */
void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
				  uint64_t offset, int nr_sectors)
{
	struct bcache_device *d = c->devices[inode];
	unsigned stripe_offset, stripe, sectors_dirty;

	if (!d)
		return;

	stripe = offset_to_stripe(d, offset);
	stripe_offset = offset & (d->stripe_size - 1);

	while (nr_sectors) {
		/* Sectors in this stripe, carrying nr_sectors' sign */
		int s = min_t(unsigned, abs(nr_sectors),
			      d->stripe_size - stripe_offset);

		if (nr_sectors < 0)
			s = -s;

		if (stripe >= d->nr_stripes)
			return;

		sectors_dirty = atomic_add_return(s,
					d->stripe_sectors_dirty + stripe);
		if (sectors_dirty == d->stripe_size)
			set_bit(stripe, d->full_dirty_stripes);
		else
			clear_bit(stripe, d->full_dirty_stripes);

		nr_sectors -= s;
		stripe_offset = 0;
		stripe++;
	}
}

static bool dirty_pred(struct keybuf *buf, struct bkey *k)
{
324 325 326 327
	struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys);

	BUG_ON(KEY_INODE(k) != dc->disk.id);

328 329 330
	return KEY_DIRTY(k);
}

331
static void refill_full_stripes(struct cached_dev *dc)
332
{
333 334 335 336 337
	struct keybuf *buf = &dc->writeback_keys;
	unsigned start_stripe, stripe, next_stripe;
	bool wrapped = false;

	stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));
338

339 340
	if (stripe >= dc->disk.nr_stripes)
		stripe = 0;
341

342
	start_stripe = stripe;
343 344

	while (1) {
345 346
		stripe = find_next_bit(dc->disk.full_dirty_stripes,
				       dc->disk.nr_stripes, stripe);
347

348 349
		if (stripe == dc->disk.nr_stripes)
			goto next;
350

351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373
		next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
						 dc->disk.nr_stripes, stripe);

		buf->last_scanned = KEY(dc->disk.id,
					stripe * dc->disk.stripe_size, 0);

		bch_refill_keybuf(dc->disk.c, buf,
				  &KEY(dc->disk.id,
				       next_stripe * dc->disk.stripe_size, 0),
				  dirty_pred);

		if (array_freelist_empty(&buf->freelist))
			return;

		stripe = next_stripe;
next:
		if (wrapped && stripe > start_stripe)
			return;

		if (stripe == dc->disk.nr_stripes) {
			stripe = 0;
			wrapped = true;
		}
374 375 376
	}
}

377 378 379
/*
 * Returns true if we scanned the entire disk
 */
380 381 382
static bool refill_dirty(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
383
	struct bkey start = KEY(dc->disk.id, 0, 0);
384
	struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
385 386 387 388 389 390 391 392 393 394
	struct bkey start_pos;

	/*
	 * make sure keybuf pos is inside the range for this disk - at bringup
	 * we might not be attached yet so this disk's inode nr isn't
	 * initialized then
	 */
	if (bkey_cmp(&buf->last_scanned, &start) < 0 ||
	    bkey_cmp(&buf->last_scanned, &end) > 0)
		buf->last_scanned = start;
395 396 397 398 399 400

	if (dc->partial_stripes_expensive) {
		refill_full_stripes(dc);
		if (array_freelist_empty(&buf->freelist))
			return false;
	}
401

402
	start_pos = buf->last_scanned;
403
	bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
404

405 406 407 408 409 410 411 412 413 414 415
	if (bkey_cmp(&buf->last_scanned, &end) < 0)
		return false;

	/*
	 * If we get to the end start scanning again from the beginning, and
	 * only scan up to where we initially started scanning from:
	 */
	buf->last_scanned = start;
	bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred);

	return bkey_cmp(&buf->last_scanned, &start_pos) >= 0;
416 417 418 419 420 421 422 423 424 425
}

/*
 * Main writeback kthread: repeatedly refill the keybuf with dirty keys
 * and write them back via read_dirty(). Sleeps while there's nothing to
 * do; once a full scan finds no dirty data left, marks the backing
 * device clean in its superblock.
 */
static int bch_writeback_thread(void *arg)
{
	struct cached_dev *dc = arg;
	bool searched_full_index;

	while (!kthread_should_stop()) {
		down_write(&dc->writeback_lock);
		/* Sleep unless dirty data exists and writeback is enabled
		 * (detaching overrides writeback_running) */
		if (!atomic_read(&dc->has_dirty) ||
		    (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
		     !dc->writeback_running)) {
			up_write(&dc->writeback_lock);
			set_current_state(TASK_INTERRUPTIBLE);

			if (kthread_should_stop())
				return 0;

			schedule();
			continue;
		}

		searched_full_index = refill_dirty(dc);

		if (searched_full_index &&
		    RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
			/* Scanned everything and found nothing dirty */
			atomic_set(&dc->has_dirty, 0);
			cached_dev_put(dc);
			SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
			bch_write_bdev_super(dc, NULL);
		}

		up_write(&dc->writeback_lock);

		bch_ratelimit_reset(&dc->writeback_rate);
		read_dirty(dc);

		if (searched_full_index) {
			unsigned delay = dc->writeback_delay * HZ;

			while (delay &&
			       !kthread_should_stop() &&
			       !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
				delay = schedule_timeout_interruptible(delay);
		}
	}

	return 0;
}

466 467
/* Init */

K
Kent Overstreet 已提交
468 469 470 471 472 473
struct sectors_dirty_init {
	struct btree_op	op;
	unsigned	inode;
};

static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
474
				 struct bkey *k)
475
{
K
Kent Overstreet 已提交
476 477
	struct sectors_dirty_init *op = container_of(_op,
						struct sectors_dirty_init, op);
478 479
	if (KEY_INODE(k) > op->inode)
		return MAP_DONE;
480

481 482 483 484 485
	if (KEY_DIRTY(k))
		bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
					     KEY_START(k), KEY_SIZE(k));

	return MAP_CONTINUE;
486 487 488 489
}

void bch_sectors_dirty_init(struct cached_dev *dc)
{
K
Kent Overstreet 已提交
490
	struct sectors_dirty_init op;
491

K
Kent Overstreet 已提交
492
	bch_btree_op_init(&op.op, -1);
493 494
	op.inode = dc->disk.id;

K
Kent Overstreet 已提交
495
	bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
496
			   sectors_dirty_init_fn, 0);
497 498

	dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
499 500
}

501
void bch_cached_dev_writeback_init(struct cached_dev *dc)
K
Kent Overstreet 已提交
502
{
503
	sema_init(&dc->in_flight, 64);
K
Kent Overstreet 已提交
504
	init_rwsem(&dc->writeback_lock);
K
Kent Overstreet 已提交
505
	bch_keybuf_init(&dc->writeback_keys);
K
Kent Overstreet 已提交
506 507 508 509 510 511 512

	dc->writeback_metadata		= true;
	dc->writeback_running		= true;
	dc->writeback_percent		= 10;
	dc->writeback_delay		= 30;
	dc->writeback_rate.rate		= 1024;

513 514 515
	dc->writeback_rate_update_seconds = 5;
	dc->writeback_rate_d_term	= 30;
	dc->writeback_rate_p_term_inverse = 6000;
K
Kent Overstreet 已提交
516

517 518 519 520 521
	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
}

int bch_cached_dev_writeback_start(struct cached_dev *dc)
{
522 523 524 525 526
	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
					      "bcache_writeback");
	if (IS_ERR(dc->writeback_thread))
		return PTR_ERR(dc->writeback_thread);

K
Kent Overstreet 已提交
527 528 529
	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);

530 531
	bch_writeback_queue(dc);

K
Kent Overstreet 已提交
532 533
	return 0;
}