
#include <linux/wait.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>

static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);

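/*
 * Fallback bdi for devices that neither account dirty pages nor do writeback
 * of their own (BDI_CAP_NO_ACCT_AND_WRITEBACK).
 */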
struct backing_dev_info noop_backing_dev_info = {
	.name		= "noop",
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
};

static struct class *bdi_class;

/*
 * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side
 * locking.
 */
DEFINE_SPINLOCK(bdi_lock);
LIST_HEAD(bdi_list);

/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *bdi_debug_root;

static void bdi_debug_init(void)
{
	bdi_debug_root = debugfs_create_dir("bdi", NULL);
}

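/*
 * Backs the per-bdi "stats" file registered in bdi_debug_register(): dumps
 * the bdi's writeback counters, dirty thresholds and per-list inode counts.
 */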
static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
	struct backing_dev_info *bdi = m->private;
	struct bdi_writeback *wb = &bdi->wb;
	unsigned long background_thresh;
	unsigned long dirty_thresh;
	unsigned long bdi_thresh;
	unsigned long nr_dirty, nr_io, nr_more_io;
	struct inode *inode;

	nr_dirty = nr_io = nr_more_io = 0;
	spin_lock(&wb->list_lock);
	list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
		nr_dirty++;
	list_for_each_entry(inode, &wb->b_io, i_wb_list)
		nr_io++;
	list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
		nr_more_io++;
	spin_unlock(&wb->list_lock);

	global_dirty_limits(&background_thresh, &dirty_thresh);
	bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);

#define K(x) ((x) << (PAGE_SHIFT - 10))
	seq_printf(m,
		   "BdiWriteback:       %10lu kB\n"
		   "BdiReclaimable:     %10lu kB\n"
		   "BdiDirtyThresh:     %10lu kB\n"
		   "DirtyThresh:        %10lu kB\n"
		   "BackgroundThresh:   %10lu kB\n"
		   "BdiDirtied:         %10lu kB\n"
		   "BdiWritten:         %10lu kB\n"
		   "BdiWriteBandwidth:  %10lu kBps\n"
		   "b_dirty:            %10lu\n"
		   "b_io:               %10lu\n"
		   "b_more_io:          %10lu\n"
		   "bdi_list:           %10u\n"
		   "state:              %10lx\n",
		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
		   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
		   K(bdi_thresh),
		   K(dirty_thresh),
		   K(background_thresh),
		   (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)),
		   (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
		   (unsigned long) K(bdi->write_bandwidth),
		   nr_dirty,
		   nr_io,
		   nr_more_io,
		   !list_empty(&bdi->bdi_list), bdi->state);
#undef K

	return 0;
}

static int bdi_debug_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, bdi_debug_stats_show, inode->i_private);
}

static const struct file_operations bdi_debug_stats_fops = {
	.open		= bdi_debug_stats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
	bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
	bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
					       bdi, &bdi_debug_stats_fops);
}

static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
	debugfs_remove(bdi->debug_stats);
	debugfs_remove(bdi->debug_dir);
}
#else
static inline void bdi_debug_init(void)
{
}
static inline void bdi_debug_register(struct backing_dev_info *bdi,
				      const char *name)
{
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
#endif

static ssize_t read_ahead_kb_store(struct device *dev,
				  struct device_attribute *attr,
				  const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned long read_ahead_kb;
	ssize_t ret;

	ret = kstrtoul(buf, 10, &read_ahead_kb);
	if (ret < 0)
		return ret;

	bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);

	return count;
}

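/* Convert a page count into kilobytes. */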
#define K(pages) ((pages) << (PAGE_SHIFT - 10))

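/*
 * BDI_SHOW(name, expr) generates the sysfs ->show() helper that formats @expr
 * for the bdi behind @dev; together with a matching <name>_store(),
 * DEVICE_ATTR_RW(name) then declares the dev_attr_<name> entry used in
 * bdi_dev_attrs[] below.
 */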
#define BDI_SHOW(name, expr)						\
static ssize_t name##_show(struct device *dev,				\
			   struct device_attribute *attr, char *page)	\
{									\
	struct backing_dev_info *bdi = dev_get_drvdata(dev);		\
									\
	return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr);	\
}									\
static DEVICE_ATTR_RW(name);

BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))

static ssize_t min_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_min_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(min_ratio, bdi->min_ratio)

static ssize_t max_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_max_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(max_ratio, bdi->max_ratio)

static ssize_t stable_pages_required_show(struct device *dev,
					  struct device_attribute *attr,
					  char *page)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);

	return snprintf(page, PAGE_SIZE-1, "%d\n",
			bdi_cap_stable_pages_required(bdi) ? 1 : 0);
}
static DEVICE_ATTR_RO(stable_pages_required);

static struct attribute *bdi_dev_attrs[] = {
	&dev_attr_read_ahead_kb.attr,
	&dev_attr_min_ratio.attr,
	&dev_attr_max_ratio.attr,
	&dev_attr_stable_pages_required.attr,
	NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);

static __init int bdi_class_init(void)
{
	bdi_class = class_create(THIS_MODULE, "bdi");
	if (IS_ERR(bdi_class))
		return PTR_ERR(bdi_class);

	bdi_class->dev_groups = bdi_dev_groups;
	bdi_debug_init();
	return 0;
}
postcore_initcall(bdi_class_init);

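/*
 * Allocate the shared writeback workqueue and initialise the noop bdi; runs
 * once at subsys_initcall time.
 */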
static int __init default_bdi_init(void)
{
	int err;

	bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
					      WQ_UNBOUND | WQ_SYSFS, 0);
	if (!bdi_wq)
		return -ENOMEM;

	err = bdi_init(&noop_backing_dev_info);

	return err;
}
subsys_initcall(default_bdi_init);

int bdi_has_dirty_io(struct backing_dev_info *bdi)
{
	return wb_has_dirty_io(&bdi->wb);
}

/*
 * This function is used when the first inode for this bdi is marked dirty. It
 * wakes up the corresponding bdi thread which should then take care of the
 * periodic background write-out of dirty inodes. Since the write-out would
 * start only 'dirty_writeback_interval' centisecs from now anyway, we just
 * set up a timer which wakes the bdi thread up later.
 *
 * Note, we wouldn't bother setting up the timer, but this function is on the
 * fast-path (used by '__mark_inode_dirty()'), so we save a few context
 * switches by delaying the wake-up.
 *
 * We have to be careful not to postpone flush work if it is scheduled for
 * earlier. Thus we use queue_delayed_work().
 */
void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
{
	unsigned long timeout;

	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
	spin_lock_bh(&bdi->wb_lock);
	if (test_bit(BDI_registered, &bdi->state))
		queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
	spin_unlock_bh(&bdi->wb_lock);
}

/*
 * Remove bdi from bdi_list, and ensure that it is no longer visible
 */
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
	spin_lock_bh(&bdi_lock);
	list_del_rcu(&bdi->bdi_list);
	spin_unlock_bh(&bdi_lock);

	synchronize_rcu_expedited();
}

int bdi_register(struct backing_dev_info *bdi, struct device *parent,
		const char *fmt, ...)
{
	va_list args;
	struct device *dev;

	if (bdi->dev)	/* The driver needs to use separate queues per device */
		return 0;

	va_start(args, fmt);
	dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
	va_end(args);
	if (IS_ERR(dev))
		return PTR_ERR(dev);

	bdi->dev = dev;

	bdi_debug_register(bdi, dev_name(dev));
	set_bit(BDI_registered, &bdi->state);

	spin_lock_bh(&bdi_lock);
	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
	spin_unlock_bh(&bdi_lock);

	trace_writeback_bdi_register(bdi);
	return 0;
}
EXPORT_SYMBOL(bdi_register);

int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
{
	return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
}
EXPORT_SYMBOL(bdi_register_dev);

/*
 * Remove bdi from the global list and shutdown any threads we have running
 */
static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
	/* Make sure nobody queues further work */
	spin_lock_bh(&bdi->wb_lock);
	if (!test_and_clear_bit(BDI_registered, &bdi->state)) {
		spin_unlock_bh(&bdi->wb_lock);
		return;
	}
	spin_unlock_bh(&bdi->wb_lock);

	/*
	 * Make sure nobody finds us on the bdi_list anymore
	 */
	bdi_remove_from_list(bdi);

	/*
	 * Drain work list and shutdown the delayed_work.  At this point,
	 * @bdi->bdi_list is empty telling bdi_writeback_workfn() that @bdi
	 * is dying and its work_list needs to be drained no matter what.
	 */
	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
	flush_delayed_work(&bdi->wb.dwork);
}

/*
 * Called when the device behind @bdi has been removed or ejected.
 *
 * We can't really do much here except for reducing the dirty ratio at
 * the moment.  In the future we should be able to set a flag so that
 * the filesystem can handle errors at mark_inode_dirty time instead
 * of only at writeback time.
 */
void bdi_unregister(struct backing_dev_info *bdi)
{
	if (WARN_ON_ONCE(!bdi->dev))
		return;

	bdi_set_min_ratio(bdi, 0);
}
EXPORT_SYMBOL(bdi_unregister);

static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
{
	memset(wb, 0, sizeof(*wb));

	wb->bdi = bdi;
	wb->last_old_flush = jiffies;
	INIT_LIST_HEAD(&wb->b_dirty);
	INIT_LIST_HEAD(&wb->b_io);
	INIT_LIST_HEAD(&wb->b_more_io);
	spin_lock_init(&wb->list_lock);
	INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
}

/*
 * Initial write bandwidth: 100 MB/s
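 * (expressed in pages per second: (100 << 20) bytes/s divided by PAGE_SIZE)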
 */
#define INIT_BW		(100 << (20 - PAGE_SHIFT))

int bdi_init(struct backing_dev_info *bdi)
{
	int i, err;

	bdi->dev = NULL;

	bdi->min_ratio = 0;
	bdi->max_ratio = 100;
	bdi->max_prop_frac = FPROP_FRAC_BASE;
	spin_lock_init(&bdi->wb_lock);
	INIT_LIST_HEAD(&bdi->bdi_list);
	INIT_LIST_HEAD(&bdi->work_list);

	bdi_wb_init(&bdi->wb, bdi);

	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
		err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL);
		if (err)
			goto err;
	}

	bdi->dirty_exceeded = 0;

	bdi->bw_time_stamp = jiffies;
	bdi->written_stamp = 0;

	bdi->balanced_dirty_ratelimit = INIT_BW;
	bdi->dirty_ratelimit = INIT_BW;
	bdi->write_bandwidth = INIT_BW;
	bdi->avg_write_bandwidth = INIT_BW;

	err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);

	if (err) {
err:
		while (i--)
			percpu_counter_destroy(&bdi->bdi_stat[i]);
	}

	return err;
}
EXPORT_SYMBOL(bdi_init);

void bdi_destroy(struct backing_dev_info *bdi)
{
	int i;

	bdi_wb_shutdown(bdi);

	WARN_ON(!list_empty(&bdi->work_list));
	WARN_ON(delayed_work_pending(&bdi->wb.dwork));

	if (bdi->dev) {
		bdi_debug_unregister(bdi);
		device_unregister(bdi->dev);
		bdi->dev = NULL;
	}

	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
		percpu_counter_destroy(&bdi->bdi_stat[i]);
	fprop_local_destroy_percpu(&bdi->completions);
}
EXPORT_SYMBOL(bdi_destroy);

/*
 * For use from filesystems to quickly init and register a bdi associated
 * with dirty writeback
 */
int bdi_setup_and_register(struct backing_dev_info *bdi, char *name)
{
	int err;

	bdi->name = name;
	bdi->capabilities = 0;
	err = bdi_init(bdi);
	if (err)
		return err;

	err = bdi_register(bdi, NULL, "%.28s-%ld", name,
			   atomic_long_inc_return(&bdi_seq));
	if (err) {
		bdi_destroy(bdi);
		return err;
	}

	return 0;
}
EXPORT_SYMBOL(bdi_setup_and_register);
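
/*
 * Illustrative sketch only (the names sbi/myfs below are hypothetical, not
 * part of this file): a filesystem embedding a backing_dev_info in its
 * per-superblock info could do, at mount time:
 *
 *	err = bdi_setup_and_register(&sbi->bdi, "myfs");
 *	if (err)
 *		return err;
 *	sb->s_bdi = &sbi->bdi;
 *
 * and then call bdi_destroy(&sbi->bdi) when the superblock is torn down.
 */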

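/*
 * Congestion tracking: nr_bdi_congested[] counts, per direction (sync/async),
 * how many bdis are currently congested; congestion_wqh[] is where the
 * congestion_wait()/wait_iff_congested() sleepers below wait until
 * clear_bdi_congested() wakes them.
 */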
static wait_queue_head_t congestion_wqh[2] = {
		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
	};
static atomic_t nr_bdi_congested[2];

void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
	enum bdi_state bit;
	wait_queue_head_t *wqh = &congestion_wqh[sync];

	bit = sync ? BDI_sync_congested : BDI_async_congested;
	if (test_and_clear_bit(bit, &bdi->state))
		atomic_dec(&nr_bdi_congested[sync]);
	smp_mb__after_atomic();
	if (waitqueue_active(wqh))
		wake_up(wqh);
}
EXPORT_SYMBOL(clear_bdi_congested);

void set_bdi_congested(struct backing_dev_info *bdi, int sync)
{
	enum bdi_state bit;

	bit = sync ? BDI_sync_congested : BDI_async_congested;
	if (!test_and_set_bit(bit, &bdi->state))
		atomic_inc(&nr_bdi_congested[sync]);
}
EXPORT_SYMBOL(set_bdi_congested);

/**
 * congestion_wait - wait for a backing_dev to become uncongested
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
 * write congestion.  If no backing_devs are congested then just wait for the
 * next write to be completed.
 */
long congestion_wait(int sync, long timeout)
{
	long ret;
	unsigned long start = jiffies;
	DEFINE_WAIT(wait);
	wait_queue_head_t *wqh = &congestion_wqh[sync];

	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	ret = io_schedule_timeout(timeout);
	finish_wait(wqh, &wait);

	trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
					jiffies_to_usecs(jiffies - start));

	return ret;
}
EXPORT_SYMBOL(congestion_wait);
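
/*
 * Illustrative sketch only: a throttling caller (page reclaim, for example)
 * typically sleeps for a short interval on the async queue, e.g.
 *
 *	congestion_wait(BLK_RW_ASYNC, HZ/10);
 *
 * and retries afterwards.
 */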

/**
 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
 * @zone: A zone to check if it is heavily congested
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * In the event of a congested backing_dev (any backing_dev) and the given
 * @zone has experienced recent congestion, this waits for up to @timeout
 * jiffies for either a BDI to exit congestion of the given @sync queue
 * or a write to complete.
 *
 * In the absence of zone congestion, cond_resched() is called to yield
 * the processor if necessary but otherwise does not sleep.
 *
 * The return value is 0 if the sleep is for the full timeout. Otherwise,
 * it is the number of jiffies that were still remaining when the function
 * returned. return_value == timeout implies the function did not sleep.
 */
long wait_iff_congested(struct zone *zone, int sync, long timeout)
{
	long ret;
	unsigned long start = jiffies;
	DEFINE_WAIT(wait);
	wait_queue_head_t *wqh = &congestion_wqh[sync];

	/*
	 * If there is no congestion, or heavy congestion is not being
	 * encountered in the current zone, yield if necessary instead
	 * of sleeping on the congestion queue
	 */
	if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
	    !test_bit(ZONE_CONGESTED, &zone->flags)) {
		cond_resched();

		/* In case we scheduled, work out time remaining */
		ret = timeout - (jiffies - start);
		if (ret < 0)
			ret = 0;

		goto out;
	}

	/* Sleep until uncongested or a write happens */
	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	ret = io_schedule_timeout(timeout);
	finish_wait(wqh, &wait);

out:
	trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
					jiffies_to_usecs(jiffies - start));

	return ret;
}
EXPORT_SYMBOL(wait_iff_congested);

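/*
 * Handler for the obsolete pdflush sysctls (e.g. vm.nr_pdflush_threads):
 * always reports "0\n" and warns once that the knob is scheduled for removal.
 */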
int pdflush_proc_obsolete(struct ctl_table *table, int write,
			void __user *buffer, size_t *lenp, loff_t *ppos)
{
	char kbuf[] = "0\n";

	if (*ppos || *lenp < sizeof(kbuf)) {
		*lenp = 0;
		return 0;
	}

	if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
		return -EFAULT;
	printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
			table->procname);

	*lenp = 2;
	*ppos += *lenp;
	return 2;
}