
#include <linux/wait.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>

static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);

struct backing_dev_info noop_backing_dev_info = {
	.name		= "noop",
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
};

static struct class *bdi_class;

/*
 * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side
 * locking.
 */
DEFINE_SPINLOCK(bdi_lock);
LIST_HEAD(bdi_list);

/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *bdi_debug_root;

static void bdi_debug_init(void)
{
	bdi_debug_root = debugfs_create_dir("bdi", NULL);
}

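/*
 * Show the lengths of the per-bdi writeback lists together with the
 * current dirty thresholds and bandwidth estimate in the debugfs
 * "stats" file.
 */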
static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
	struct backing_dev_info *bdi = m->private;
	struct bdi_writeback *wb = &bdi->wb;
	unsigned long background_thresh;
	unsigned long dirty_thresh;
	unsigned long bdi_thresh;
	unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
	struct inode *inode;

	nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
	spin_lock(&wb->list_lock);
	list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
		nr_dirty++;
	list_for_each_entry(inode, &wb->b_io, i_wb_list)
		nr_io++;
	list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
		nr_more_io++;
	list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list)
		if (inode->i_state & I_DIRTY_TIME)
			nr_dirty_time++;
	spin_unlock(&wb->list_lock);

	global_dirty_limits(&background_thresh, &dirty_thresh);
	bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);

#define K(x) ((x) << (PAGE_SHIFT - 10))
	seq_printf(m,
		   "BdiWriteback:       %10lu kB\n"
		   "BdiReclaimable:     %10lu kB\n"
		   "BdiDirtyThresh:     %10lu kB\n"
		   "DirtyThresh:        %10lu kB\n"
		   "BackgroundThresh:   %10lu kB\n"
		   "BdiDirtied:         %10lu kB\n"
		   "BdiWritten:         %10lu kB\n"
		   "BdiWriteBandwidth:  %10lu kBps\n"
		   "b_dirty:            %10lu\n"
		   "b_io:               %10lu\n"
		   "b_more_io:          %10lu\n"
		   "b_dirty_time:       %10lu\n"
		   "bdi_list:           %10u\n"
		   "state:              %10lx\n",
		   (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
		   (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
		   K(bdi_thresh),
		   K(dirty_thresh),
		   K(background_thresh),
		   (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
		   (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
		   (unsigned long) K(bdi->write_bandwidth),
		   nr_dirty,
		   nr_io,
		   nr_more_io,
		   nr_dirty_time,
		   !list_empty(&bdi->bdi_list), bdi->wb.state);
#undef K

	return 0;
}

static int bdi_debug_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, bdi_debug_stats_show, inode->i_private);
}

static const struct file_operations bdi_debug_stats_fops = {
	.open		= bdi_debug_stats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
	bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
	bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
					       bdi, &bdi_debug_stats_fops);
}

static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
	debugfs_remove(bdi->debug_stats);
	debugfs_remove(bdi->debug_dir);
}
#else
static inline void bdi_debug_init(void)
{
}
static inline void bdi_debug_register(struct backing_dev_info *bdi,
				      const char *name)
{
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
#endif

static ssize_t read_ahead_kb_store(struct device *dev,
				  struct device_attribute *attr,
				  const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned long read_ahead_kb;
	ssize_t ret;

	ret = kstrtoul(buf, 10, &read_ahead_kb);
	if (ret < 0)
		return ret;

	bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);

	return count;
}

#define K(pages) ((pages) << (PAGE_SHIFT - 10))

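/*
 * BDI_SHOW(name, expr) generates the sysfs ->show() method that prints
 * 'expr'; together with the matching name##_store() it is wired up as a
 * read-write device attribute via DEVICE_ATTR_RW().
 */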
#define BDI_SHOW(name, expr)						\
static ssize_t name##_show(struct device *dev,				\
			   struct device_attribute *attr, char *page)	\
{									\
	struct backing_dev_info *bdi = dev_get_drvdata(dev);		\
									\
	return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr);	\
}									\
static DEVICE_ATTR_RW(name);

BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))

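/*
 * min_ratio/max_ratio bound this bdi's share of the global dirty limit,
 * in percent; see bdi_set_min_ratio() and bdi_set_max_ratio().
 */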
static ssize_t min_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_min_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(min_ratio, bdi->min_ratio)

static ssize_t max_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_max_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(max_ratio, bdi->max_ratio)

static ssize_t stable_pages_required_show(struct device *dev,
					  struct device_attribute *attr,
					  char *page)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);

	return snprintf(page, PAGE_SIZE-1, "%d\n",
			bdi_cap_stable_pages_required(bdi) ? 1 : 0);
}
static DEVICE_ATTR_RO(stable_pages_required);

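/*
 * Per-bdi sysfs attributes; they appear under /sys/class/bdi/<name>/
 * (read_ahead_kb, min_ratio, max_ratio, stable_pages_required).
 */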
static struct attribute *bdi_dev_attrs[] = {
	&dev_attr_read_ahead_kb.attr,
	&dev_attr_min_ratio.attr,
	&dev_attr_max_ratio.attr,
	&dev_attr_stable_pages_required.attr,
	NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);

static __init int bdi_class_init(void)
{
	bdi_class = class_create(THIS_MODULE, "bdi");
	if (IS_ERR(bdi_class))
		return PTR_ERR(bdi_class);

	bdi_class->dev_groups = bdi_dev_groups;
	bdi_debug_init();
	return 0;
}
postcore_initcall(bdi_class_init);

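/*
 * Create the shared "writeback" workqueue used by all bdi flusher work
 * items and initialise the statically allocated noop bdi.
 */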
static int __init default_bdi_init(void)
{
	int err;

	bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
					      WQ_UNBOUND | WQ_SYSFS, 0);
	if (!bdi_wq)
		return -ENOMEM;

	err = bdi_init(&noop_backing_dev_info);

	return err;
}
subsys_initcall(default_bdi_init);

int bdi_has_dirty_io(struct backing_dev_info *bdi)
{
	return wb_has_dirty_io(&bdi->wb);
}

/*
 * This function is used when the first inode for this bdi is marked dirty. It
 * wakes up the corresponding bdi thread which should then take care of the
 * periodic background write-out of dirty inodes. Since the write-out would
 * start only 'dirty_writeback_interval' centisecs from now anyway, we just
 * set up a timer which wakes the bdi thread up later.
 *
 * Note, we wouldn't bother setting up the timer, but this function is on the
 * fast-path (used by '__mark_inode_dirty()'), so we save a few context
 * switches by delaying the wake-up.
 *
 * We have to be careful not to postpone flush work if it is scheduled for
 * earlier. Thus we use queue_delayed_work().
 */
void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
{
	unsigned long timeout;

	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
	spin_lock_bh(&bdi->wb_lock);
	if (test_bit(WB_registered, &bdi->wb.state))
		queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
	spin_unlock_bh(&bdi->wb_lock);
}

/*
 * Remove bdi from bdi_list, and ensure that it is no longer visible
 */
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
	spin_lock_bh(&bdi_lock);
	list_del_rcu(&bdi->bdi_list);
	spin_unlock_bh(&bdi_lock);

	synchronize_rcu_expedited();
}

int bdi_register(struct backing_dev_info *bdi, struct device *parent,
		const char *fmt, ...)
{
	va_list args;
	struct device *dev;

	if (bdi->dev)	/* The driver needs to use separate queues per device */
		return 0;

	va_start(args, fmt);
	dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
	va_end(args);
	if (IS_ERR(dev))
		return PTR_ERR(dev);

	bdi->dev = dev;

	bdi_debug_register(bdi, dev_name(dev));
	set_bit(WB_registered, &bdi->wb.state);

	spin_lock_bh(&bdi_lock);
	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
	spin_unlock_bh(&bdi_lock);

	trace_writeback_bdi_register(bdi);
	return 0;
}
EXPORT_SYMBOL(bdi_register);

int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
{
	return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
}
EXPORT_SYMBOL(bdi_register_dev);

/*
 * Remove bdi from the global list and shut down any threads we have running
 */
static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
	/* Make sure nobody queues further work */
	spin_lock_bh(&bdi->wb_lock);
	if (!test_and_clear_bit(WB_registered, &bdi->wb.state)) {
		spin_unlock_bh(&bdi->wb_lock);
		return;
	}
	spin_unlock_bh(&bdi->wb_lock);

	/*
	 * Make sure nobody finds us on the bdi_list anymore
	 */
	bdi_remove_from_list(bdi);

	/*
	 * Drain work list and shut down the delayed_work.  At this point,
	 * @bdi->bdi_list is empty telling bdi_writeback_workfn() that @bdi
	 * is dying and its work_list needs to be drained no matter what.
	 */
	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
	flush_delayed_work(&bdi->wb.dwork);
}

/*
 * Called when the device behind @bdi has been removed or ejected.
 *
 * We can't really do much here except for reducing the dirty ratio at
 * the moment.  In the future we should be able to set a flag so that
 * the filesystem can handle errors at mark_inode_dirty time instead
 * of only at writeback time.
 */
void bdi_unregister(struct backing_dev_info *bdi)
{
	if (WARN_ON_ONCE(!bdi->dev))
		return;

	bdi_set_min_ratio(bdi, 0);
}
EXPORT_SYMBOL(bdi_unregister);

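/*
 * Set up the embedded bdi_writeback: the dirty inode lists, the delayed
 * writeback work item and the per-cpu WB_* statistics counters.
 */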
static int bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
{
	int i, err;

	memset(wb, 0, sizeof(*wb));

	wb->bdi = bdi;
	wb->last_old_flush = jiffies;
	INIT_LIST_HEAD(&wb->b_dirty);
	INIT_LIST_HEAD(&wb->b_io);
	INIT_LIST_HEAD(&wb->b_more_io);
	INIT_LIST_HEAD(&wb->b_dirty_time);
	spin_lock_init(&wb->list_lock);
	INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);

	for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
		err = percpu_counter_init(&wb->stat[i], 0, GFP_KERNEL);
		if (err) {
			while (i--)
				percpu_counter_destroy(&wb->stat[i]);
			return err;
		}
	}

	return 0;
}

static void bdi_wb_exit(struct bdi_writeback *wb)
{
	int i;

	WARN_ON(delayed_work_pending(&wb->dwork));

	for (i = 0; i < NR_WB_STAT_ITEMS; i++)
		percpu_counter_destroy(&wb->stat[i]);
}

/*
 * Initial write bandwidth: 100 MB/s
 */
#define INIT_BW		(100 << (20 - PAGE_SHIFT))

int bdi_init(struct backing_dev_info *bdi)
{
	int err;

	bdi->dev = NULL;

	bdi->min_ratio = 0;
	bdi->max_ratio = 100;
	bdi->max_prop_frac = FPROP_FRAC_BASE;
	spin_lock_init(&bdi->wb_lock);
	INIT_LIST_HEAD(&bdi->bdi_list);
	INIT_LIST_HEAD(&bdi->work_list);

	err = bdi_wb_init(&bdi->wb, bdi);
	if (err)
		return err;

	bdi->dirty_exceeded = 0;

	bdi->bw_time_stamp = jiffies;
	bdi->written_stamp = 0;

	bdi->balanced_dirty_ratelimit = INIT_BW;
	bdi->dirty_ratelimit = INIT_BW;
	bdi->write_bandwidth = INIT_BW;
	bdi->avg_write_bandwidth = INIT_BW;

	err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
	if (err) {
		bdi_wb_exit(&bdi->wb);
		return err;
	}

	return 0;
}
EXPORT_SYMBOL(bdi_init);

void bdi_destroy(struct backing_dev_info *bdi)
{
	bdi_wb_shutdown(bdi);

	WARN_ON(!list_empty(&bdi->work_list));

	if (bdi->dev) {
		bdi_debug_unregister(bdi);
		device_unregister(bdi->dev);
		bdi->dev = NULL;
	}

	bdi_wb_exit(&bdi->wb);

	fprop_local_destroy_percpu(&bdi->completions);
}
EXPORT_SYMBOL(bdi_destroy);

/*
 * For use from filesystems to quickly init and register a bdi associated
 * with dirty writeback
 */
int bdi_setup_and_register(struct backing_dev_info *bdi, char *name)
{
	int err;

	bdi->name = name;
	bdi->capabilities = 0;
	err = bdi_init(bdi);
	if (err)
		return err;

	err = bdi_register(bdi, NULL, "%.28s-%ld", name,
			   atomic_long_inc_return(&bdi_seq));
	if (err) {
		bdi_destroy(bdi);
		return err;
	}

	return 0;
}
EXPORT_SYMBOL(bdi_setup_and_register);
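
/*
 * Illustrative caller (sketch only; "sbi" and the "myfs" name are
 * hypothetical):
 *
 *	err = bdi_setup_and_register(&sbi->bdi, "myfs");
 *	if (err)
 *		return err;
 *	sb->s_bdi = &sbi->bdi;
 */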

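/*
 * Per-direction congestion wait queues and counters: index 0 is async
 * (BLK_RW_ASYNC), index 1 is sync (BLK_RW_SYNC).
 */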
static wait_queue_head_t congestion_wqh[2] = {
		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
	};
static atomic_t nr_bdi_congested[2];

void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
	enum wb_state bit;
	wait_queue_head_t *wqh = &congestion_wqh[sync];

	bit = sync ? WB_sync_congested : WB_async_congested;
	if (test_and_clear_bit(bit, &bdi->wb.state))
		atomic_dec(&nr_bdi_congested[sync]);
	smp_mb__after_atomic();
	if (waitqueue_active(wqh))
		wake_up(wqh);
}
EXPORT_SYMBOL(clear_bdi_congested);

void set_bdi_congested(struct backing_dev_info *bdi, int sync)
{
	enum wb_state bit;

	bit = sync ? WB_sync_congested : WB_async_congested;
	if (!test_and_set_bit(bit, &bdi->wb.state))
		atomic_inc(&nr_bdi_congested[sync]);
}
EXPORT_SYMBOL(set_bdi_congested);

/**
 * congestion_wait - wait for a backing_dev to become uncongested
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
 * write congestion.  If no backing_devs are congested then just wait for the
 * next write to be completed.
 */
long congestion_wait(int sync, long timeout)
{
	long ret;
	unsigned long start = jiffies;
	DEFINE_WAIT(wait);
	wait_queue_head_t *wqh = &congestion_wqh[sync];

	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	ret = io_schedule_timeout(timeout);
	finish_wait(wqh, &wait);

	trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
					jiffies_to_usecs(jiffies - start));

	return ret;
}
EXPORT_SYMBOL(congestion_wait);

/**
 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
 * @zone: A zone to check if it is heavily congested
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * In the event of a congested backing_dev (any backing_dev) and the given
 * @zone has experienced recent congestion, this waits for up to @timeout
 * jiffies for either a BDI to exit congestion of the given @sync queue
 * or a write to complete.
 *
 * In the absence of zone congestion, cond_resched() is called to yield
 * the processor if necessary but otherwise does not sleep.
 *
 * The return value is 0 if the sleep is for the full timeout. Otherwise,
 * it is the number of jiffies that were still remaining when the function
 * returned. return_value == timeout implies the function did not sleep.
 */
long wait_iff_congested(struct zone *zone, int sync, long timeout)
{
	long ret;
	unsigned long start = jiffies;
	DEFINE_WAIT(wait);
	wait_queue_head_t *wqh = &congestion_wqh[sync];

	/*
	 * If there is no congestion, or heavy congestion is not being
	 * encountered in the current zone, yield if necessary instead
	 * of sleeping on the congestion queue
	 */
	if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
	    !test_bit(ZONE_CONGESTED, &zone->flags)) {
		cond_resched();

		/* In case we scheduled, work out time remaining */
		ret = timeout - (jiffies - start);
		if (ret < 0)
			ret = 0;

		goto out;
	}

	/* Sleep until uncongested or a write happens */
	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	ret = io_schedule_timeout(timeout);
	finish_wait(wqh, &wait);

out:
	trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
					jiffies_to_usecs(jiffies - start));

	return ret;
}
EXPORT_SYMBOL(wait_iff_congested);

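/*
 * Handler for the obsolete pdflush sysctl (e.g. nr_pdflush_threads): it
 * always reads back "0" and warns once that the /proc file is scheduled
 * for removal.
 */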
int pdflush_proc_obsolete(struct ctl_table *table, int write,
			void __user *buffer, size_t *lenp, loff_t *ppos)
{
	char kbuf[] = "0\n";

	if (*ppos || *lenp < sizeof(kbuf)) {
		*lenp = 0;
		return 0;
	}

	if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
		return -EFAULT;
	printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
			table->procname);

	*lenp = 2;
	*ppos += *lenp;
	return 2;
}